This is an R Markdown Notebook. When you execute code within the notebook, the results appear beneath the code.

Try executing this chunk by clicking the Run button within the chunk or by placing your cursor inside it and pressing Cmd+Shift+Enter.

##all viruses - target and off target

library(tidyverse)
── Attaching core tidyverse packages ─── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.1     ✔ tibble    3.2.1
✔ lubridate 1.9.4     ✔ tidyr     1.3.1
✔ purrr     1.0.4     ── Conflicts ───────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the ]8;;http://conflicted.r-lib.org/conflicted package]8;; to force all conflicts to become errors
library(ggpubr)
library(tidyplots)

Attaching package: ‘tidyplots’

The following object is masked from ‘package:ggpubr’:

    gene_expression
library(ggthemes)
library(scales)

Attaching package: ‘scales’

The following object is masked from ‘package:purrr’:

    discard

The following object is masked from ‘package:readr’:

    col_factor

Figure 1 - linear model

Read-in data

#Linear model for TE data - separated by virus
#November 2024

#viral read depth
#use bowtie2 data

library(tidyverse)
library(broom)
library(boot)

####read counts/normalised read counts####

#metadata

metadata <-
  read.csv(
    "metadata/sampleIDs_TESpikeIn.csv",
    header = TRUE
  )

metadata2 <- metadata |>
  select(Sample.ID, Number.of.read.pairs..quality.adaptor.trimmed.) |>
  rename(Sample_id = Sample.ID) |>
  rename(QC_reads = Number.of.read.pairs..quality.adaptor.trimmed.)

#import and combine read count files
#deduplicated and non-deduplicated

counts_dedup_bt <-
  read.table(
    "data_TE/TE_sequencing_experiment_readcount_per_sample_and_virus_bowtie2_dedup_atcc_ref.tsv",
    sep = "\t",
    header = TRUE
  )

counts_nodedup_bt <-
  read.table(
    "data_TE/TE_sequencing_experiment_readcount_per_sample_and_virus_bowtie2_nodedup_atcc_ref.tsv",
    sep = "\t",
    header = TRUE
  )

counts_bt_all <- rbind(counts_dedup_bt, counts_nodedup_bt)

counts_reads <- left_join(counts_bt_all, metadata2, by = "Sample_id")

####read depths####

#import and combine read depth files

depth_dedup_bt <-
  read.table(
    "data_TE/TE_sequencing_experiment_readdepth_per_sample_and_virus_bowtie2_dedup_atcc_ref.tsv",
    sep = "\t",
    header = TRUE
  )

depth_nodedup_bt <-
  read.table(
    "data_TE/TE_sequencing_experiment_readdepth_per_sample_and_virus_bowtie2_nodedup_atcc_ref.tsv",
    sep = "\t",
    header = TRUE
  )

Format data


depths_bt_all <- rbind(depth_dedup_bt, depth_nodedup_bt)

depths_reads <- left_join(depths_bt_all, metadata2, by = "Sample_id") |>
  mutate(genome_structure = case_when((
    virus == "Human_adenovirus_40" |
      virus == "Human_betaherpesvirus"
  ) ~ "DNA",
  ,
  .default = "RNA"
  ))

####linear model for spike in viruses - mean read depth####

depths_reads_sub <- depths_reads |>
  filter(Background != "p6") |>
  filter(Background != "control") |>
  filter(type == "dedup_TE") |>
  mutate(log_depth = log10(mean_depth)) |>
  mutate(log_viral_load = log10(Viral.load))

#inspect data
hist(depths_reads_sub$mean_depth)
hist(depths_reads_sub$Viral.load)
hist(depths_reads_sub$log_depth)
hist(depths_reads_sub$log_viral_load)

summary(depths_reads_sub$log_depth)
summary(depths_reads_sub$log_viral_load)

model

#all viruses together

lm1 <-
  glm(log_depth ~ log_viral_load * virus,
      data = depths_reads_sub,
      family = "gaussian")

summary(lm1)

glm.diag.plots(lm1)

pred <- predict(lm1, type = "response")

rsq <- function (x, y) {
  cor(x, y) ^ 2
}

rsq(depths_reads_sub$log_depth, pred)

##split by viruses

cols <- c("#4477AA",
          "#66CCEE",
          "#228833",
          "#CCBB44",
          "#EE6677",
          "#AA3377")

facet_names <- c(
  "Human_adenovirus_40" = "Human adenovirus 40",
  "Human_betaherpesvirus" = "Human betaherpesvirus",
  "Human_respiratory_syncytial_virus" = "Human respiratory syncytial virus",
  "Influenza_B_virus" = "Influenza B virus",
  "Mammalian_orthoreovirus3" = "Mammalian orthoreovirus 3",
  "Zika_virus" = "Zika virus"
)

virus <- (unique(depths_reads_sub$virus)) %>%
  rep(., each = 2) %>%
  data.frame()

list_models <- 
  depths_reads_sub |>
  group_split(virus) |>
  map( ~ lm(log_depth ~ log_viral_load, data = .))

lm_tidy <- 
  map(list_models, broom::tidy) %>%
  do.call(rbind.data.frame, .) %>%
  cbind(virus, .) %>%
  rename(virus = ".") %>%
  select(virus, term, estimate) %>%
  pivot_wider(names_from = "term", values_from = "estimate")

lm_summary <- depths_reads_sub |>
  group_by(virus) |>
  summarise(
    Intercept = lm(log_depth ~ log_viral_load)$coefficients[1],
    Coeff_x1 = lm(log_depth ~ log_viral_load)$coefficients[2],
    R2 = summary(lm(log_depth ~ log_viral_load))$r.squared,
    pvalue = summary(lm(log_depth ~ log_viral_load))$coefficients["log_viral_load", 4]
  )

lm_combined <- 
  left_join(lm_tidy, lm_summary, by = "virus")

Make plot

Final plot

#ggsave("figures/compare_spike_ins_atcc/model_readdepth_dedup.png")

ggsave("figures/manuscript_figure_2025/PDF/Figure_1.pdf",
       width = 8,
       height = 5)

ggsave("figures/manuscript_figure_2025/PNG/Figure_1.png",
       width = 8,
       height = 5)

——————–

Figure S1

Read-in data / Make plots

#metadata

metadata <-
  read.csv(
    "metadata/sampleIDs_TESpikeIn.csv",
    header = TRUE
  )

metadata2 <- metadata |>
  select(Sample.ID, Number.of.read.pairs..quality.adaptor.trimmed.) |>
  rename(Sample_id = Sample.ID) |>
  rename(QC_reads = Number.of.read.pairs..quality.adaptor.trimmed.)

#import and combine read count files
#deduplicated and non-deduplicated

counts_dedup_bt <-
  read.table(
    "data_TE/TE_sequencing_experiment_readcount_per_sample_and_virus_bowtie2_dedup_atcc_ref.tsv",
    sep = "\t",
    header = TRUE
  )

counts_nodedup_bt <-
  read.table(
    "data_TE/TE_sequencing_experiment_readcount_per_sample_and_virus_bowtie2_nodedup_atcc_ref.tsv",
    sep = "\t",
    header = TRUE
  )

counts_bt_all <- rbind(counts_dedup_bt, counts_nodedup_bt)

counts_reads <- left_join(counts_bt_all, metadata2, by = "Sample_id")

#normalise by both raw read count and genome length - should be the same as mean read depth

counts_reads_norm <- counts_reads |>
  mutate(norm_counts1 = matched / QC_reads) |>
  mutate(norm_counts2 = matched / QC_reads / length) |>
  mutate(norm_counts3 = matched / length) |>
  mutate(genome_structure = case_when((
    virus == "Human_adenovirus_40" |
      virus == "Human_betaherpesvirus"
  ) ~ "DNA",
  .default = "RNA"
  ))

####compare library capture pool####

cols3 <- c("#228833", "#AA3377")

facet_names <- c("dedup_TE" = "Deduplicated",
                 "nodedup_TE" = "Non-Deduplicated")

metadata3 <- 
  metadata |>
  select(Sample.ID, Pool.for.sequencing) |>
  rename(Sample_id = Sample.ID) |>
  rename(Pool = Pool.for.sequencing)

counts_pool <- left_join(counts_reads_norm, metadata3, by = "Sample_id")

Make plots


pool_counts_sum <- 
  counts_pool |>
  filter(Background != "p6") |>
  filter(Background != "control") |>
  ggplot(aes(
    x = as.character(Viral.load),
    y = log10(matched),
    colour = Pool
  )) +
  geom_boxplot() +
  facet_grid( ~ type, labeller = as_labeller(facet_names)) +
  theme_few() +
  theme(
    # axis.title.x = element_blank(),
    # axis.text.x = element_blank(),
    legend.title = element_blank(),
    axis.title.y = element_text(size = 10),
    panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
    panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
  ) +
  ylab("Log (Viral Reads)") + xlab("Genome copies") +
  scale_color_manual(values = cols3, labels = c("Pool 1", "Pool 2")) + 
  scale_x_discrete(labels = c(bquote(10^{2}), bquote(10^{3}), bquote(10^{5})))
  

# pool_readcounts_norm <- 
#   counts_pool |>
#   filter(Background != "p6") |>
#   filter(Background != "control") |>
#   ggplot(aes(x = virus, y = log10(norm_counts1))) +
#   geom_boxplot() +
#   facet_grid(Pool ~ type) +
#   theme_few() +
#   theme(
#     axis.title.x = element_blank(),
#     # axis.text.x = element_blank(),
#     axis.title.y = element_text(size = 10),
#     panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
#     panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
#   ) +
#   ylab("log10(viral reads/cleaned reads)")

pool_norm1_sum <- 
  counts_pool |>
  filter(Background != "p6") |>
  filter(Background != "control") |>
  ggplot(aes(
    x = as.character(Viral.load),
    y = log10(norm_counts1),
    colour = Pool
  )) +
  geom_boxplot() +
  facet_grid( ~ type, labeller = as_labeller(facet_names)) +
  theme_few() +
  theme(
    # axis.title.x = element_blank(),
    # axis.text.x = element_blank(),
    legend.title = element_blank(),
    axis.title.y = element_text(size = 10),
    panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
    panel.grid.minor = element_line(linewidth=0.1, color = "lightgray") 
  ) +
  ylab("Log (viral reads/cleaned reads)") + xlab("Genome copies") +
  scale_color_manual(values = cols3, labels = c("Pool 1", "Pool 2")) + 
    scale_x_discrete(labels = c(bquote(10^{2}), bquote(10^{3}), bquote(10^{5})))

# pool_readcounts_norm2 <- 
#   counts_pool |>
#   filter(Background != "p6") |>
#   filter(Background != "control") |>
#   ggplot(aes(x = virus, y = log10(norm_counts2))) +
#   geom_boxplot() +
#   facet_grid(Pool ~ type) +
#   theme_few() +
#   theme(
#     axis.title.x = element_blank(),
#     # axis.text.x = element_blank(),
#     axis.title.y = element_text(size = 10),
#     panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
#     panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
#   ) +
#   ylab("log10(viral reads/cleaned reads/genome length)")


pool_norm2_sum <- 
  counts_pool |>
  filter(Background != "p6") |>
  filter(Background != "control") |>
  ggplot(aes(
    x = as.character(Viral.load),
    y = norm_counts2,
    colour = Pool
  )) +
  geom_boxplot() +
  facet_grid( ~ type, labeller = as_labeller(facet_names)) +
  theme_few() +
  theme(
    # axis.title.x = element_blank(),
    # axis.text.x = element_blank(),
    legend.title = element_blank(),
    axis.title.y = element_text(size = 10),
    panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
    panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
  ) +
  ylab("Log (viral reads/cleaned reads/genome length)") + xlab("Genome copies") +
  scale_color_manual(values = cols3, labels = c("Pool 1", "Pool 2")) + 
    scale_x_discrete(labels = c(bquote(10^{2}), bquote(10^{3}), bquote(10^{5})))


#import and combine read depth files

depth_dedup_bt <-
  read.table(
    "data_TE/TE_sequencing_experiment_readdepth_per_sample_and_virus_bowtie2_dedup_atcc_ref.tsv",
    sep = "\t",
    header = TRUE
  )

depth_nodedup_bt <-
  read.table(
    "data_TE/TE_sequencing_experiment_readdepth_per_sample_and_virus_bowtie2_nodedup_atcc_ref.tsv",
    sep = "\t",
    header = TRUE
  )

depths_bt_all <- rbind(depth_dedup_bt, depth_nodedup_bt)

depths_reads <- 
  left_join(depths_bt_all, metadata2, by = "Sample_id") |>
  mutate(genome_structure = case_when((
    virus == "Human_adenovirus_40" |
      virus == "Human_betaherpesvirus"
  ) ~ "DNA",
  .default = "RNA"
  ))

depths_pool <- left_join(depths_reads, metadata3, by = "Sample_id")

# pool_readdepths <- 
#   depths_pool |>
#   filter(Background != "p6") |>
#   filter(Background != "control") |>
#   ggplot(aes(x = virus, y = log10(mean_depth))) +
#   geom_boxplot() +
#   facet_grid(Pool ~ type) +
#   theme_few() +
#   theme(
#     axis.title.x = element_blank(),
#     axis.text.x = element_text(angle = 45, hjust = 1),
#     axis.title.y = element_text(size = 10),
#     panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
#     panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
#   ) +
#   ylab("Log (mean read depth)")

pool_depths_sum <-
  depths_pool |>
  filter(Background != "p6") |>
  filter(Background != "control") |>
  ggplot(aes(
    x = as.character(Viral.load),
    y = log10(mean_depth),
    colour = Pool
  )) +
  geom_boxplot() +
  facet_grid( ~ type, labeller = as_labeller(facet_names)) +
  theme_few() +
  theme(
    # axis.title.x = element_blank(),
    # axis.text.x = element_blank(),
    legend.title = element_blank(),
    axis.title.y = element_text(size = 10),
    panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
    panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
  ) +
  ylab("Log (mean read depth)") +
  xlab("Genome copies") + 
  scale_x_discrete(labels = c(bquote(10^{2}), bquote(10^{3}), bquote(10^{5}))) +
  scale_color_manual(values = cols3, labels = c("Pool 1", "Pool 2"))

Final plot

ggarrange(
  pool_counts_sum,
  pool_depths_sum,
  nrow = 1,
  ncol = 2,
  common.legend = TRUE, 
  align = "hv"
)


#ggsave("figures/compare_spike_ins_atcc/pool_compare_viruses_sum.png",width=10,height=7)

ggsave("figures/manuscript_figure_2025/PDF/Figure_S1.pdf",
       width = 12,
       height = 4)

ggsave("figures/manuscript_figure_2025/PNG/Figure_S1.png",
       width = 12,
       height = 4)

——————–

Figure S2

Read counts

### Read-in data

#ATCC genomes updated version
#October 2024

#viral load vs read count normalised using different methods, comparing deduplicated and non-deduplicated
#use bowtie2 data

####read counts/normalised read counts####

#metadata

metadata <- read.csv("metadata/sampleIDs_TESpikeIn.csv", header = TRUE)

metadata2 <- metadata |>
  select(Sample.ID, Number.of.read.pairs..quality.adaptor.trimmed.) |>
  rename(Sample_id = Sample.ID) |>
  rename(QC_reads = Number.of.read.pairs..quality.adaptor.trimmed.)

#import and combine read count files
#deduplicated and non-deduplicated

counts_dedup_bt <-
  read.table(
    "data_TE/TE_sequencing_experiment_readcount_per_sample_and_virus_bowtie2_dedup_atcc_ref.tsv",
    sep = "\t",
    header = TRUE
  )

counts_nodedup_bt <-
  read.table(
    "data_TE/TE_sequencing_experiment_readcount_per_sample_and_virus_bowtie2_nodedup_atcc_ref.tsv",
    sep = "\t",
    header = TRUE
  )

counts_bt_all <- rbind(counts_dedup_bt, counts_nodedup_bt)

counts_reads <- left_join(counts_bt_all, metadata2, by = "Sample_id")

#normalise by both raw read count and genome length - should be the same as mean read depth

counts_reads_norm <- counts_reads |>
  mutate(norm_counts1 = matched / QC_reads) |>
  mutate(norm_counts2 = matched / QC_reads / length) |>
  mutate(norm_counts3 = matched / length) |>
  mutate(genome_structure = case_when((
    virus == "Human_adenovirus_40" |
      virus == "Human_betaherpesvirus"
  ) ~ "DNA",
  .default = "RNA"
  ))

Read depths

depth_nodedup_bt <-
  read_tsv(
    "data_TE/TE_sequencing_experiment_readdepth_per_sample_and_virus_bowtie2_nodedup_atcc_ref.tsv"
  )
Rows: 95 Columns: 12── Column specification ───────────────────────────────────
Delimiter: "\t"
chr (5): virus, Sample_id, Background, mapper, type
dbl (7): Viral load, mean_depth, median_depth, min, max...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Format data

#change labels in facet plots

facet_names <- c("dedup_TE" = "Deduplicated",
                 "nodedup_TE" = "Non-Deduplicated")

cols <- c("#4477AA",
          "#66CCEE",
          "#228833",
          "#CCBB44",
          "#EE6677",
          "#AA3377")

Make plots

read_depths <- 
  depths_reads |>
  filter(Background != "p6") |>
  filter(Background != "control") |>
  ggplot(aes(
    x = Viral.load,
    y = log10(mean_depth),
    colour = virus
  )) +
  geom_point() +
  geom_smooth(aes(group = virus, linetype = genome_structure), se = FALSE) +
  facet_grid( ~ type, labeller = as_labeller(facet_names)) +
  theme_few() +
  theme(
    axis.title.x = element_text(size = 12),
    axis.text.x = element_text(angle = 0, hjust = 1),
    axis.title.y = element_text(size = 12),
    legend.title = element_blank(),
    legend.position = "top", 
    panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
    panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
  ) +
  scale_color_manual(
    values = cols,
    labels = c(
      "Human adenovirus 40",
      "Human betaherpesvirus",
      "Human respiratory syncytial virus",
      "Influenza B virus",
      "Mammalian orthoreovirus 3",
      "Zika virus"
    )
  ) +
  scale_linetype_manual(values=c(2, 1)) +
  scale_linetype_manual(values=c("dotted", "solid")) + 
  ylab("Log (mean read depth)") +
  xlab("Genome copies") + 
  scale_x_log10(labels = scales::trans_format("log10", scales::label_math()))
Scale for linetype is already present.
Adding another scale for linetype, which will replace the
existing scale.

Final plot

——————–

Figure S3

reads_plot <- 
  reads_viral_all |>
  group_by(Background, Sample_id, Viral.load, type) |>
  summarise(
    total_reads = (QC_reads * 2),
    viral_reads = total_virus_reads,
    ATCC_reads = sum(matched),
    prop_ATCC = ATCC_reads / total_reads,
    prop_viral = total_virus_reads / total_reads,
    diff = viral_reads - ATCC_reads
  ) |>
  unique()
Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in dplyr 1.1.0.
Please use `reframe()` instead.
When switching from `summarise()` to `reframe()`, remember that `reframe()` always returns an ungrouped data frame and adjust accordingly.`summarise()` has grouped output by 'Background', 'Sample_id', 'Viral.load', 'type'. You can override using the `.groups` argument.

Make plots

Final plot

——————–

No longer required? Figure S4

Make plots

####read counts/depths split by background####

#change labels in facet plots

facet_names_bg <- c(
  "dedup_TE" = "Deduplicated",
  "nodedup_TE" = "Non-Deduplicated",
  "p2" = "M1",
  "p8" = "M2"
)

#break down by Background x virus

background_counts_reads <- 
  counts_reads_norm |>
  filter(Background != "p6") |>
  filter(Background != "control") |>
  ggplot(aes(
    x = Viral.load,
    y = log10(matched),
    colour = virus
  )) +
  geom_point() +
  geom_smooth(aes(group = virus, linetype = genome_structure), se = FALSE) +
  facet_grid(Background ~ type, labeller = as_labeller(facet_names_bg)) +
  theme_few() +
  theme(
    axis.title.x = element_blank(),
    axis.title.y = element_text(size = 10),
    panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
    panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
  ) +
  scale_color_manual(
    values = cols,
    labels =
      c(
      "Human adenovirus 40",
      "Human betaherpesvirus",
      "Human respiratory syncytial virus",
      "Influenza B virus",
      "Mammalian orthoreovirus 3",
      "Zika virus"
      )
  ) +
  scale_x_log10(labels = scales::label_log10()) +
  ylab("Log (Viral Reads)")

backgrounds_counts_reads_norm <- 
  counts_reads_norm |>
  filter(Background != "p6") |>
  filter(Background != "control") |>
  ggplot(aes(
    x = Viral.load,
    y = log10(norm_counts1),
    colour = virus
  )) +
  geom_point() +
  geom_smooth(aes(group = virus, linetype = genome_structure), se = FALSE) +
  facet_grid(Background ~ type, labeller = as_labeller(facet_names_bg)) +
  theme_bw() +
  theme_few() +
  theme(
    axis.title.x = element_blank(),
    axis.title.y = element_text(size = 10),
    panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
    panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
  ) +
  scale_color_manual(
    values = cols,
    labels = c(
      "Human adenovirus 40",
      "Human betaherpesvirus",
      "Human respiratory syncytial virus",
      "Influenza B virus",
      "Mammalian orthoreovirus 3",
      "Zika virus"
    )
  ) +
  scale_x_log10(label = scales::label_log10()) +
  ylab("Log (viral reads/cleaned reads)")

backgrounds_counts_reads_norm2 <- 
  counts_reads_norm |>
  filter(Background != "p6") |>
  filter(Background != "control") |>
  ggplot(aes(
    x = Viral.load,
    y = log10(norm_counts2),
    colour = virus
  )) +
  geom_point() +
  geom_smooth(aes(group = virus, linetype = genome_structure), se = FALSE) +
  facet_grid(Background ~ type, labeller = as_labeller(facet_names_bg)) +
  theme_few() +
  theme(
    axis.title.x = element_blank(),
    axis.title.y = element_text(size = 10),
    panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
    panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
  ) +
  scale_color_manual(
    values = cols,
    labels = c(
      "Human adenovirus 40",
      "Human betaherpesvirus",
      "Human respiratory syncytial virus",
      "Influenza B virus",
      "Mammalian orthoreovirus 3",
      "Zika virus"
    )
  ) +
  scale_x_log10(labels = scales::label_log10()) +
  ylab("Log (viral reads/cleaned reads/genome length)")

background_read_depths <- 
  depths_reads |>
  filter(Background != "p6") |>
  filter(Background != "control") |>
  ggplot(aes(
    x = Viral.load,
    y = log10(mean_depth),
    colour = virus
  )) +
  geom_point() +
  geom_smooth(aes(group = virus, linetype = genome_structure), se = FALSE) +
  facet_grid(Background ~ type, labeller = as_labeller(facet_names_bg)) +
  theme_few() +
  theme(
    axis.title.x = element_blank(),
    panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
    panel.grid.minor = element_line(linewidth=0.1, color = "lightgray"),
    axis.title.y = element_text(size = 10),
    legend.title = element_blank()
  ) +
  scale_color_manual(
    values = cols,
    labels = c(
      "Human adenovirus 40",
      "Human betaherpesvirus",
      "Human respiratory syncytial virus",
      "Influenza B virus",
      "Mammalian orthoreovirus 3",
      "Zika virus"
    )
  ) +
  scale_x_log10(label = scales::label_log10()) +
  ylab("Log (mean read depth)")

Final plot

——————–

——————–

>>> Figure 2: plots of TE experiment data - genome coverage & per site coverage

Read-in data

counts_nodedup_bt <-
  read_tsv(
    "data_TE/TE_sequencing_experiment_readcount_per_sample_and_virus_bowtie2_nodedup_atcc_ref.tsv"
  )
Rows: 96 Columns: 9── Column specification ───────────────────────────────────
Delimiter: "\t"
chr (5): virus, Sample_id, Background, mapper, type
dbl (4): length, matched, unmatched, Viral load
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Format data

persite_coverage_nodedup <- 
  nodedup_per_site |>
  rename(Viral.load = `Viral load`) |>
  group_by(Sample_id, virus, Viral.load, Background, type) |>
  filter(coverage > 0) |>
  summarise(genome_coverage = n()) |>
  left_join(lengths) |>
  mutate(percent_coverage = genome_coverage / length)
`summarise()` has grouped output by 'Sample_id', 'virus', 'Viral.load', 'Background'. You can override using the `.groups` argument.Joining with `by = join_by(virus)`

Make plots

Final plot

——————–

——————–

Figure S4

Final plot

ggsave("figures/manuscript_figure_2025/PDF/Figure_S4.pdf",
       width=15,
       height=10)

ggsave("figures/manuscript_figure_2025/PNG/Figure_S4.png",
       width=15,
       height=10)

——————–

Figure S5

####Final plot


ggsave("figures/manuscript_figure_2025/PDF/Figure_S5.pdf",
       width=15,
       height=10)

ggsave("figures/manuscript_figure_2025/PNG/Figure_S5.png",
       width=15,
       height=10)

——————–

Figure S6

####Final plot


ggsave("figures/manuscript_figure_2025/PDF/Figure_S6.pdf",
       width=15,
       height=10)

ggsave("figures/manuscript_figure_2025/PNG/Figure_S6.png",
       width=15,
       height=10)

——————–

Figure S7

HBV <- 
persite_norm |>
  filter(Background != "p6") |>
  filter(Background != "control") |>
  filter(type == "dedup_TE") |>
  filter(virus == "Human_betaherpesvirus") |>
  rename(Viral.load = `Viral load`) |>
  ggplot(aes(x = site, y = coverage, fill = Background)) +
  geom_col() +
  facet_wrap(Viral.load ~ Sample_id, label = label_parsed) +
  ylab("Coverage") +
  ggtitle("Human Betaherpesvirus (deduplicated)") +
  guides(fill = guide_legend(title = "Background")) +
 theme_few() +
  theme(
    axis.title.x = element_text(size = 12),
    axis.text.x = element_text(angle = 30, hjust = 1),
    axis.title.y = element_text(size = 12),
    legend.title = element_blank(),
    legend.position = "top", 
    panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
    panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
  ) +
  scale_y_log10() +
  scale_fill_manual(values = cols2, labels = c("M1", "M2"))

####Final plot


ggsave(plot = HBV, file="figures/manuscript_figure_2025/PDF/Figure_S7.pdf",
       width=15,
       height=10,
       dpi = 600)

ggsave(plot = HBV, file="figures/manuscript_figure_2025/PNG/Figure_S7.png",
       width=15,
       height=10)

——————–

Figure S8

ggarrange(FLU_ME1, FLU_ME2, ncol = 2, common.legend = TRUE)
Warning: log-10 transformation introduced infinite values.Warning: Removed 3867 rows containing missing values or values
outside the scale range (`geom_col()`).Warning: log-10 transformation introduced infinite values.Warning: Removed 3867 rows containing missing values or values
outside the scale range (`geom_col()`).Warning: log-10 transformation introduced infinite values.Warning: Removed 996 rows containing missing values or values
outside the scale range (`geom_col()`).

####Final plot


ggsave("figures/manuscript_figure_2025/PDF/Figure_S8.pdf",
       width=20,
       height=12,
       dpi=600)

ggsave("figures/manuscript_figure_2025/PNG/Figure_S8.png",
       width=20,
       height=12)

——————–

Figure S9


REO_ME1 <- 
  persite_norm |>
  filter(virus == "Mammalian_orthoreovirus3") |>
  filter(type == "dedup_TE") |>
  filter(Background == "p2") |>
  rename(Viral.load = `Viral load`) |>
  ggplot(aes(
    x = site,
    y = coverage,
    fill = as.character(Viral.load)
  )) +
  geom_col() +
  facet_grid(seg ~ Sample_id) +
  ylab("Coverage") +
  ggtitle("Mammalian orthoreovirus 3 (Background M1 deduplicated)") +
  guides(fill = guide_legend(title = "Viral load")) +
  theme_few() +
  theme(
    axis.title.x = element_text(size = 12),
    axis.text.x = element_text(angle = 30, hjust = 1),
    axis.title.y = element_text(size = 12),
    legend.title = element_blank(),
    legend.position = "top",
    panel.grid.major = element_line(linewidth = 0.1, color = "gray60"),
    panel.grid.minor = element_line(linewidth = 0.1, color = "lightgray")
  ) +
  scale_y_log10() +
  scale_fill_manual(values = cols4,
                    label = c(
                      expression("10" ^ "2"),
                      expression("10" ^ "3"),
                      expression("10" ^ "5")
                    ))

REO_ME2 <- 
  persite_norm |>
  filter(virus == "Mammalian_orthoreovirus3") |>
  filter(type == "dedup_TE") |>
  filter(Background == "p8") |>
  rename(Viral.load = `Viral load`) |>
  ggplot(aes(
    x = site,
    y = coverage,
    fill = as.character(Viral.load)
  )) +
  geom_col() +
  facet_grid(seg ~ Sample_id) +
  ylab("Coverage") +
  ggtitle("Mammalian orthoreovirus 3 (Background M2 deduplicated)") +
  guides(fill = guide_legend(title = "Viral load")) +
  theme_few() +
  theme(
    axis.title.x = element_text(size = 12),
    axis.text.x = element_text(angle = 30, hjust = 1),
    axis.title.y = element_text(size = 12),
    legend.title = element_blank(),
    legend.position = "top",
    panel.grid.major = element_line(linewidth = 0.1, color = "gray60"),
    panel.grid.minor = element_line(linewidth = 0.1, color = "lightgray")
  ) + scale_y_log10() +
  scale_fill_manual(values = cols4,
                    label = c(
                      expression("10" ^ "2"),
                      expression("10" ^ "3"),
                      expression("10" ^ "5")
                    ))

ggarrange(REO_ME1, REO_ME2, ncol = 2, common.legend = TRUE)
Warning: log-10 transformation introduced infinite values.Warning: Removed 2181 rows containing missing values or values
outside the scale range (`geom_col()`).Warning: log-10 transformation introduced infinite values.Warning: Removed 2181 rows containing missing values or values
outside the scale range (`geom_col()`).Warning: log-10 transformation introduced infinite values.Warning: Removed 934 rows containing missing values or values
outside the scale range (`geom_col()`).

####Final plot

ggsave("figures/manuscript_figure_2025/PDF/Figure_S9.pdf",
       width=20,
       height=12,
       dpi=600)

ggsave("figures/manuscript_figure_2025/PNG/Figure_S9.png",
       width=20,
       height=12,
       dpi=600)

——————–

>>>> Figure 3: all viruses - target and off target

Read-in data


# read in metadata

metadata <-
  read.csv("metadata/sampleIDs_TESpikeIn.csv", header = TRUE)

metadata2 <-
  metadata |>
  select(
    Sample.ID,
    Background.sample,
    Viral.load,
    Number.of.read.pairs..quality.adaptor.trimmed.
  ) |>
  rename(Sample_id = Sample.ID) |>
  rename(QC_reads = Number.of.read.pairs..quality.adaptor.trimmed.)

#import viral reads mapped (to calculate proportions)

viral_reads_dedup <-
  read.csv(
    "data_TE/total_virus_mapped_reads_per_sample_dedup_atcc_ref_20241108.csv",
    header = TRUE
  )

viral_reads_nodedup <-
  read.csv(
    "data_TE/total_virus_mapped_reads_per_sample_nodedup_atcc_ref_20241108.csv",
    header = TRUE
  )

#import contingency tables

contingency_dedup <-
  read.csv(
    "data_TE/contingency_table_mapped_virus_reads_per_family_genus_sample_dedup_atcc_ref_20241106.csv",
    header = TRUE
  )

contingency_nodedup <-
  read.csv(
    "data_TE/contingency_table_mapped_virus_reads_per_family_genus_sample_nodedup_atcc_ref_20241106.csv",
    header = TRUE
  )

contaminants <-
  c("Betacoronavirus", "Alphainfluenzavirus", "Gammaretrovirus")

Format data


# pivot longer  
dedup_long <-
  contingency_dedup |>
  filter(!genus %in% contaminants) |>
  filter(genus != "NA") |>
  pivot_longer(cols = A:P,
               names_to = "Sample_id",
               values_to = "count") |>
  mutate(
    target = case_when(
      genus == "Cyclovirus" ~ "off_target",
      genus == "Cardiovirus" ~ "off_target",
      genus == "Kobuvirus" ~ "off_target",
      .default = "on_target"
    )
  ) |>
  mutate(genome_structure = case_when((genus == "Mastadenovirus" |
                                         genus == "Cytomegalovirus") ~ "DNA",
                                      .default = "RNA"
  ))

no_dedup_long <- contingency_nodedup |>
  filter(!genus %in% contaminants) |>
  filter(genus != "NA") |>
  pivot_longer(cols = A:P,
               names_to = "Sample_id",
               values_to = "count") |>
  mutate(
    target = case_when(
      genus == "Cyclovirus" ~ "off_target",
      genus == "Cardiovirus" ~ "off_target",
      genus == "Kobuvirus" ~ "off_target",
      .default = "on_target"
    )
  ) |>
  mutate(genome_structure = case_when((genus == "Mastadenovirus" |
                                         genus == "Cytomegalovirus") ~ "DNA",
                                      .default = "RNA"
  ))

metadata_dedup <- left_join(dedup_long, metadata2, by = "Sample_id")

dedup_viral_reads <-
  left_join(metadata_dedup, viral_reads_dedup, by = "Sample_id") |>
  mutate(total_reads = QC_reads * 2) |>
  mutate(prop_total = count / total_reads) |>
  mutate(prop_viral = count / total_virus_reads)

metadata_no_dedup <- left_join(no_dedup_long, metadata2, by = "Sample_id")

nodedup_viral_reads <-
  left_join(metadata_no_dedup, viral_reads_nodedup, by = "Sample_id") |>
  mutate(total_reads = QC_reads * 2) |>
  mutate(prop_total = count / total_reads) |>
  mutate(prop_viral = count / total_virus_reads)

Make plots

Dedup – viral reads, prop all reads, and prop viral reads

Final plot

…………………….

…………………….

—-Genome medicine results—-

Figure S10

Read-in data



####read counts/normalised read counts####

#import and combine read count files

gm_readcount_dedup <-
  read.table(
    "data_genome_medicine/genome_medicine_TE_sequencing_experiment_readcount_per_sample_and_virus_bowtie2_dedup_atcc_ref.tsv",
    sep = "\t",
    header = TRUE
  )

gm_readcount_nodedup <-
  read.table(
    "data_genome_medicine/genome_medicine_TE_sequencing_experiment_readcount_per_sample_and_virus_bowtie2_nodedup_atcc_ref.tsv",
    sep = "\t",
    header = TRUE
  )

gm_counts_all <- rbind(gm_readcount_dedup, gm_readcount_nodedup)

read counts

read depth

#import and combine read depth files

depth_dedup_gm <-
  read.table(
    "data_genome_medicine/genome_medicine_TE_sequencing_experiment_readdepth_per_sample_and_virus_bowtie2_dedup_atcc_ref.tsv",
    sep = "\t",
    header = TRUE
  )

depth_nodedup_gm <-
  read.table(
    "data_genome_medicine/genome_medicine_TE_sequencing_experiment_readdepth_per_sample_and_virus_bowtie2_nodedup_atcc_ref.tsv",
    sep = "\t",
    header = TRUE
  )

depths_gm_all <- rbind(depth_dedup_gm, depth_nodedup_gm)

——————–

Figure S11

Read-in data

####genome coverage####

unzip(zipfile = "data_genome_medicine/genome_medicine_TE_sequencing_experiment_readdepth_per_site_sample_and_virus_bowtie2_dedup_atcc_ref.tsv.zip", exdir = "data_genome_medicine/")

dedup_per_site <-
  read.table(
    "data_genome_medicine/genome_medicine_TE_sequencing_experiment_readdepth_per_site_sample_and_virus_bowtie2_dedup_atcc_ref.tsv",
    sep = "\t",
    header = TRUE
  )

file.remove("data_genome_medicine/genome_medicine_TE_sequencing_experiment_readdepth_per_site_sample_and_virus_bowtie2_dedup_atcc_ref.tsv")
[1] TRUE
#View(dedup_per_site)

counts_dedup_bt <-
  read.table(
    "data_TE/TE_sequencing_experiment_readcount_per_sample_and_virus_bowtie2_dedup_atcc_ref.tsv",
    sep = "\t",
    header = TRUE
  )

#add length column from read depth file to per site file

lengths <- 
  counts_dedup_bt |>
  select(virus,length) |>
  distinct()

#calculate genome coverage

coverage <- 
  dedup_per_site |>
  group_by(sample_id, virus, Viral.load, Replicate) |>
  filter(coverage > 0) |>
  summarise(genome_coverage = n()) |>
  left_join(lengths) |>
  mutate(percent_coverage = genome_coverage / length)
`summarise()` has grouped output by 'sample_id', 'virus', 'Viral.load'. You can override using the `.groups` argument.Joining with `by = join_by(virus)`
coverage |>
  filter(Viral.load != 0) |>
  ggplot(aes(x = virus, y = percent_coverage)) +
  geom_boxplot(outlier.shape = NA) +
  geom_point(position = position_dodge(width = .75)) +
  facet_grid( ~ as.character(Viral.load)) +
  theme_bw() +
  theme(axis.title.y = element_text(size = 10)) +
  ylab("Genome coverage") +
  theme_few() +
  theme(
    axis.title.x = element_blank(),
    axis.text.x = element_text(angle = 45, hjust = 1),
    axis.title.y = element_text(size = 15),
    axis.text = element_text(size = 12),
    strip.text = element_text(size = 12),
    legend.title = element_blank(),
    legend.position = "top",
    panel.grid.major = element_line(linewidth = 0.1, color = "gray60"),
    panel.grid.minor = element_line(linewidth = 0.1, color = "lightgray")
  ) +
  scale_y_continuous(limits = c(0, 1)) +
    scale_x_discrete(
    labels = c(
      "Human adenovirus 40",
      "Human betaherpesvirus",
      "Human respiratory syncytial virus",
      "Influenza B virus",
      "Mammalian orthoreovirus 3",
      "Zika virus"
    )
  )


#ggsave("/Users/laura/Dropbox/glasgow/github/te_ug_rodents/figures/genome_medicine/all_genome_coverage.png",width=14,height=6)

#ggsave("/Users/laura/Dropbox/glasgow/github/te_ug_rodents/figures/manuscript_figures_pdf/FigureS12.pdf",width=20,height=6)

*** Final plot ***

#PDF
ggsave(
  "figures/manuscript_figure_2025/PDF/Figure_S11.pdf",
  width = 14,
  height = 6
)

#PNG
ggsave(
  "figures/manuscript_figure_2025/PNG/Figure_S11.png",
  width = 14,
  height = 6
)

——————–

Figure S12

*** Final plot ***

ggsave("figures/manuscript_figure_2025/PDF/Figure_S12.pdf",
       width=12,
       height=10)

ggsave("figures/manuscript_figure_2025/PNG/Figure_S12.png",
       width=12,
       height=10)

…………………….

—-Kobuvirus / Cardiovirus results—-

Figure S13


kobu_depth <-
  read.table(
    "data_kobuvirus/kobuvirus_TE_polyomics_readdepth_20241007.tsv",
    sep = "\t",
    header = TRUE
  ) %>%
  select(!expt) %>%
  select(!Number.of.paired.end.reads..QT.)

cardio_depth <-
  read.table(
    "data_cardiovirus/cardiovirus_denovo_bowtie2_read_depth_per_sample_dedup.tsv",
    sep = ",",
    header = TRUE
  )

cardio_depth_poly <- 
  read.table(
    "data_cardiovirus/cardiovirus_denovo_bowtie2_readdepth_per_sample_dedup_polyomics.tsv",
    sep = ",",
    header = TRUE
  ) |>
  select(-contig_name) |>
  filter(Background %in% c("S18", "S19", "S43")) |>
  select(-Background) |>
  separate(Sample_id, into = c(NA, NA, "Background"), sep = "-", remove = FALSE)

depth_both <- rbind(kobu_depth, cardio_depth, cardio_depth_poly) |>
  mutate(Viral.load = ifelse(is.na(Viral.load), 0, Viral.load))

depth_both$type <- depth_both$type |>
  case_match("dedup_TE" ~ "dedup",
             "dedup_polyomics" ~ "dedup",
             .default = depth_both$type)

depth_both$Sample_id <- depth_both$Sample_id |>
  case_match("RNA-Msp-p2" ~ "M1",
             "RNA-Msp-p8" ~ "M2",
             "RNA-Msp-p6" ~ "M3",
             .default = depth_both$Sample_id
             )

depth_both$virus <- depth_both$virus |>
  case_match("cardiovirus" ~ "Cardiovirus",
             "k97_19971" ~ "Kobuvirus",
             .default = depth_both$virus
             )

cols2 <- c("#BB5566", "#004488")

coverage_labels <- c(
  "k97_19971" = "Kobuvirus",
  "cardiovirus" = "Cardiovirus",
  "p2" = "M1",
  "p8" = "M2"
)

*** Final plot ***


depth_both$Viral.load <- factor(depth_both$Viral.load, labels = c("Shotgun",
    "10^{2}", 
    "10^{3}", 
    "10^{5}"
    ))


# depth_both |>
#   filter(Background != "p6") |>
#   filter(type == "dedup") |>
#   filter(mapper == "bowtie2") |>
#   filter(Viral.load != "NA") |>
#   ggplot(aes(
#     x = Viral.load,
#     y = (mean_depth),
#     colour = Background
#   )) +
#   geom_point() +
#  theme_few() +
#   theme(
#     axis.title.x = element_text(size = 12),
#     axis.title.y = element_text(size = 12),
#     legend.title = element_blank(),
#     legend.position = "none", 
#     panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
#     panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
#   ) +  
#   facet_grid(Background ~ virus, labeller = as_labeller(coverage_labels)) +
#   scale_color_manual(values = cols2) +
#   ylab("Mean read depth") +
#     xlab("Spike-in viral load (gc/ml)") +
#    scale_x_log10(labels = scales::trans_format("log10",
#                                         scales::label_math())) + 
#   scale_y_log10(labels = scales::trans_format("log10",
#                                         scales::label_math()))

# replotting S13 in similar way to S14
depth_both %>%
  filter(Background != "p6") |>
  filter(type == "dedup") |>
  filter(mapper == "bowtie2") |>
  # filter(Viral.load != "NA") |>
  ggplot(aes(x = Sample_id, y = mean_depth, color = Background)) +
  geom_point() +
  ylab("Mean read depth") +
  xlab("Sample ID") +
  facet_grid(virus ~ Viral.load,
             scales = "free_x",
             label = label_parsed) +
  scale_color_manual(values = cols2, labels = c("M1", "M2")) +
 theme_few() +
  theme(
    axis.title.x = element_text(size = 12),
    axis.title.y = element_text(size = 12),
    legend.title = element_blank(),
    legend.position = "right", 
    panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
    panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
  )+
  scale_y_log10(labels = scales::trans_format("log10",
                                        scales::label_math()))


ggsave("figures/manuscript_figure_2025/PDF/Figure_S13_v2.pdf",
       width=8,
       height=4)

ggsave("figures/manuscript_figure_2025/PNG/Figure_S13_v2.png",
       width=8,
       height=4)

——————–

Figure S14


kobu_per_site <-
  read.table(
    "data_kobuvirus/kobuvirus_TE_polyomics_readdepth_persite_20241007.tsv",
    sep = "\t",
    header = TRUE
  ) %>%
  select(virus,
         seg,
         site,
         coverage,
         n_sites,
         Sample_id,
         Background,
         Viral.load,
         mapper,
         type)

kobu_per_site$Viral.load <- 
  kobu_per_site$Viral.load %>%
  replace_na(., 0)

kobu_per_site$Sample_id <- kobu_per_site$Sample_id %>%
  case_match("RNA-Msp-p2" ~ "M1",
             "RNA-Msp-p8" ~ "M2",
             .default = kobu_per_site$Sample_id)

kobu_per_site$Background <- kobu_per_site$Background %>%
  case_match("p2" ~ "M1",
             "p8" ~ "M2",
             .default = kobu_per_site$Background)

cardio_per_site <-
  read.table(
    "data_cardiovirus/cardiovirus_denovo_bowtie2_read_depth_per_site_and_sample_dedup.tsv",
    sep = ",",
    header = TRUE
  )

cardio_polyomics_per_site <-
  read.table(
    "data_cardiovirus/cardiovirus_denovo_bowtie2_readdepth_per_site_sample_dedup_polyomics.tsv",
    sep = ",",
    header = TRUE
  ) %>%
  select(!contig_name)

#View(cardio_polyomics_per_site)

cardio_TE_polyomics <-
  full_join(
    cardio_per_site,
    cardio_polyomics_per_site,
    by = c(
      "virus",
      "seg",
      "site",
      "coverage",
      "n_sites",
      "Sample_id",
      "Background",
      "Viral.load",
      "mapper",
      "type"
    )
  )

cardio_TE_polyomics$Viral.load <- cardio_TE_polyomics$Viral.load %>%
  replace_na(., 0)

drops <-
  c(
    "RNA-Msp-p1",
    "RNA-Msp-p3",
    "RNA-Msp-p4",
    "RNA-Msp-p5",
    "RNA-Msp-p6",
    "RNA-Msp-p7",
    "RNA-Msp-p9"
  )

cardio_TE_polyomics <- cardio_TE_polyomics %>%
  filter(!Sample_id %in% drops)

cardio_TE_polyomics$Sample_id <- cardio_TE_polyomics$Sample_id %>%
  case_match("RNA-Msp-p2" ~ "M1",
             "RNA-Msp-p8" ~ "M2",
             .default = cardio_TE_polyomics$Sample_id)

cardio_TE_polyomics$Background <- cardio_TE_polyomics$Background %>%
  case_match("S18" ~ "M1",
             "S43" ~ "M2",
             "p2" ~ "M1",
             "p8" ~ "M2",
             .default = cardio_TE_polyomics$Background)

persite_both <- rbind(kobu_per_site, cardio_TE_polyomics)

persite_both$type <- persite_both$type %>%
  case_match("dedup_TE" ~ "dedup",
             "dedup_polyomics" ~ "dedup",
             .default = persite_both$type)

#TE data only - compare coverage between backgrounds/viral loads

cols2 <- c("#BB5566", "#004488")

coverage_labels <- c(
  "100" = "10^{2}",
  "1000" = "10^{3}",
  "100000" = "10^{5}",
  "0" = "Shotgun",
  "Kobuvirus" = "Kobuvirus",
  "Cardiovirus" = "Cardiovirus"
)

coverage_labels2 <- c(
  "100" = "10^{2}",
  "1000" = "10^{3}",
  "100000" = "10^{5}",
  "0" = "Shotgun",
  "A" = "A",
  "B" = "B",
  "C" = "C",
  "D" = "D",
  "F" = "F",
  "G" = "G",
  "H" = "H",
  "I" = "I",
  "J" = "J",
  "K" = "K",
  "L" = "L",
  "M" = "M",
  "N" = "N",
  "O" = "O",
  "P" = "P",
  "ME_P1" = "M1",
  "ME_P2" = "M2"
)

kobu_coverage <- kobu_per_site %>%
  filter(mapper == "bowtie2") %>%
  filter(type == "dedup") %>%
  filter(Background != "p6") %>%
  group_by(Sample_id, Viral.load, Background) %>%
  filter(coverage > 0) %>%
  summarise(genome_coverage = n()) %>%
  mutate(percent_coverage = genome_coverage / 8467) %>%
  mutate(virus = "Kobuvirus")
`summarise()` has grouped output by 'Sample_id', 'Viral.load'. You can override using the `.groups` argument.
cardio_coverage <- cardio_TE_polyomics %>%
  filter(Background != "p6") %>%
  group_by(Sample_id, Viral.load, Background) %>%
  filter(coverage > 0) %>%
  summarise(genome_coverage = n()) %>%
  mutate(percent_coverage = genome_coverage / 6945) %>%
  mutate(virus = "Cardiovirus")
`summarise()` has grouped output by 'Sample_id', 'Viral.load'. You can override using the `.groups` argument.
coverage_both <- rbind(kobu_coverage, cardio_coverage)

coverage_both$Viral.load <- factor(coverage_both$Viral.load, labels = c("Shotgun",
    "10^{2}", 
    "10^{3}", 
    "10^{5}"
    ))

*** Final plot ***


coverage_both %>%
  ggplot(aes(x = Sample_id, y = percent_coverage, color = Background)) +
  geom_point() +
  ylab("Genome coverage") +
  xlab("Sample ID") +
  facet_grid(virus ~ (Viral.load),
             scales = "free_x",
             label = label_parsed) +
  scale_color_manual(values = cols2, labels = c("M1", "M2")) +
 theme_few() +
  theme(
    axis.title.x = element_text(size = 12),
    axis.title.y = element_text(size = 12),
    legend.title = element_blank(),
    legend.position = "right", 
    panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
    panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
  ) + 
  scale_y_continuous(limits = c(0, 1))

#PDF
ggsave("figures/manuscript_figure_2025/PDF/Figure_S14.pdf", 
       width = 8, 
       height = 4)

#PNG
ggsave("figures/manuscript_figure_2025/PNG/Figure_S14.png", 
       width = 8, 
       height = 4)

——————–

Figure S15

*** Final plot ***

#PDF
ggsave("figures/manuscript_figure_2025/PDF/Figure_S15.pdf",
       width=12,
       height=7)

#PNG
ggsave("figures/manuscript_figure_2025/Png/Figure_S15.png",
       width=12,
       height=7)

Figure S16

*** Final plot ***


#PDF
ggsave("figures/manuscript_figure_2025/PDF/Figure_S16.pdf",
       width=12,
       height=7)


#PNG
ggsave("figures/manuscript_figure_2025/PNG/Figure_S16.png",
       width=12,
       height=7)
---
title: "target_offtarget"
output: html_notebook
editor_options: 
  chunk_output_type: inline
---

This is an [R Markdown](http://rmarkdown.rstudio.com) Notebook. When you execute code within the notebook, the results appear beneath the code. 

Try executing this chunk by clicking the *Run* button within the chunk or by placing your cursor inside it and pressing *Cmd+Shift+Enter*. 

```{r}
##all viruses - target and off target

library(tidyverse)
library(ggpubr)
library(tidyplots)
library(ggthemes)
library(scales)

```

### Figure 1 - linear model

#### Read-in data
```{r}
#Linear model for TE data - separated by virus
#November 2024

#viral read depth
#use bowtie2 data

library(tidyverse)
library(broom)
library(boot)

####read counts/normalised read counts####

#metadata

metadata <-
  read.csv(
    "metadata/sampleIDs_TESpikeIn.csv",
    header = TRUE
  )

metadata2 <- metadata |>
  select(Sample.ID, Number.of.read.pairs..quality.adaptor.trimmed.) |>
  rename(Sample_id = Sample.ID) |>
  rename(QC_reads = Number.of.read.pairs..quality.adaptor.trimmed.)

#import and combine read count files
#deduplicated and non-deduplicated

counts_dedup_bt <-
  read.table(
    "data_TE/TE_sequencing_experiment_readcount_per_sample_and_virus_bowtie2_dedup_atcc_ref.tsv",
    sep = "\t",
    header = TRUE
  )

counts_nodedup_bt <-
  read.table(
    "data_TE/TE_sequencing_experiment_readcount_per_sample_and_virus_bowtie2_nodedup_atcc_ref.tsv",
    sep = "\t",
    header = TRUE
  )

counts_bt_all <- rbind(counts_dedup_bt, counts_nodedup_bt)

counts_reads <- left_join(counts_bt_all, metadata2, by = "Sample_id")

####read depths####

#import and combine read depth files

depth_dedup_bt <-
  read.table(
    "data_TE/TE_sequencing_experiment_readdepth_per_sample_and_virus_bowtie2_dedup_atcc_ref.tsv",
    sep = "\t",
    header = TRUE
  )

depth_nodedup_bt <-
  read.table(
    "data_TE/TE_sequencing_experiment_readdepth_per_sample_and_virus_bowtie2_nodedup_atcc_ref.tsv",
    sep = "\t",
    header = TRUE
  )

```

#### Format data
```{r}

depths_bt_all <- rbind(depth_dedup_bt, depth_nodedup_bt)

depths_reads <- left_join(depths_bt_all, metadata2, by = "Sample_id") |>
  mutate(genome_structure = case_when((
    virus == "Human_adenovirus_40" |
      virus == "Human_betaherpesvirus"
  ) ~ "DNA",
  ,
  .default = "RNA"
  ))

####linear model for spike in viruses - mean read depth####

depths_reads_sub <- depths_reads |>
  filter(Background != "p6") |>
  filter(Background != "control") |>
  filter(type == "dedup_TE") |>
  mutate(log_depth = log10(mean_depth)) |>
  mutate(log_viral_load = log10(Viral.load))

#inspect data
hist(depths_reads_sub$mean_depth)
hist(depths_reads_sub$Viral.load)
hist(depths_reads_sub$log_depth)
hist(depths_reads_sub$log_viral_load)

summary(depths_reads_sub$log_depth)
summary(depths_reads_sub$log_viral_load)

```

#### model
```{r}
#all viruses together

lm1 <-
  glm(log_depth ~ log_viral_load * virus,
      data = depths_reads_sub,
      family = "gaussian")

summary(lm1)

glm.diag.plots(lm1)

pred <- predict(lm1, type = "response")

rsq <- function (x, y) {
  cor(x, y) ^ 2
}

rsq(depths_reads_sub$log_depth, pred)

##split by viruses

cols <- c("#4477AA",
          "#66CCEE",
          "#228833",
          "#CCBB44",
          "#EE6677",
          "#AA3377")

facet_names <- c(
  "Human_adenovirus_40" = "Human adenovirus 40",
  "Human_betaherpesvirus" = "Human betaherpesvirus",
  "Human_respiratory_syncytial_virus" = "Human respiratory syncytial virus",
  "Influenza_B_virus" = "Influenza B virus",
  "Mammalian_orthoreovirus3" = "Mammalian orthoreovirus 3",
  "Zika_virus" = "Zika virus"
)

virus <- (unique(depths_reads_sub$virus)) %>%
  rep(., each = 2) %>%
  data.frame()

list_models <- 
  depths_reads_sub |>
  group_split(virus) |>
  map( ~ lm(log_depth ~ log_viral_load, data = .))

lm_tidy <- 
  map(list_models, broom::tidy) %>%
  do.call(rbind.data.frame, .) %>%
  cbind(virus, .) %>%
  rename(virus = ".") %>%
  select(virus, term, estimate) %>%
  pivot_wider(names_from = "term", values_from = "estimate")

lm_summary <- depths_reads_sub |>
  group_by(virus) |>
  summarise(
    Intercept = lm(log_depth ~ log_viral_load)$coefficients[1],
    Coeff_x1 = lm(log_depth ~ log_viral_load)$coefficients[2],
    R2 = summary(lm(log_depth ~ log_viral_load))$r.squared,
    pvalue = summary(lm(log_depth ~ log_viral_load))$coefficients["log_viral_load", 4]
  )

lm_combined <- 
  left_join(lm_tidy, lm_summary, by = "virus")


```

#### Make plot
```{r, message=FALSE, warning=FALSE, echo=FALSE}

ggplot() +
  geom_point(data = depths_reads_sub,
             mapping = aes(Viral.load, mean_depth, color = virus)) +
  geom_abline(data = lm_combined, aes(intercept = `(Intercept)`, slope = log_viral_load), linetype="dotted") +
  geom_text(data = lm_combined,
            colour = "black",
            aes(
              x = 6*10000,
              y = 2,
              label = round(Coeff_x1, digits = 2)),
            size = 3.5) +
  geom_text(data = lm_combined,
            colour = "black",
            aes(x = 1.5*10000, 
                y = 2, 
                label = "Slope ="),
            size = 3.5) +
  geom_text(data = lm_combined,
            colour = "black",
            aes(
              x = 6*10000,
              y = 6,
              label = round(R2, digits = 2)),
            size = 3.5) +
  geom_text(data = lm_combined,
            colour = "black",
            aes(x = 2*10000, 
                y = 6, 
                label = "R2 ="),
            size = 3.5) +
  facet_wrap( ~ virus, labeller = as_labeller(facet_names)) +
  theme_few() +
  theme(
    axis.title.x = element_text(size = 10),
    panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
    panel.grid.minor = element_line(linewidth=0.1, color = "lightgray"),
    axis.title.y = element_text(size = 10),
    legend.title = element_blank()
  ) +
    xlab("Genome copies") + 
  ylab("Mean Read Depth") +
  scale_color_manual(values = cols) +
  guides(color = FALSE) +
  scale_x_log10(labels = scales::trans_format("log10", scales::label_math())) + 
    scale_y_log10(labels = scales::trans_format("log10", scales::label_math()))

```

#### Final plot
```{r}
#ggsave("figures/compare_spike_ins_atcc/model_readdepth_dedup.png")

ggsave("figures/manuscript_figure_2025/PDF/Figure_1.pdf",
       width = 8,
       height = 5)

ggsave("figures/manuscript_figure_2025/PNG/Figure_1.png",
       width = 8,
       height = 5)

```


### --------------------
## Figure S1

### Read-in data / Make plots
```{r}
#metadata

metadata <-
  read.csv(
    "metadata/sampleIDs_TESpikeIn.csv",
    header = TRUE
  )

metadata2 <- metadata |>
  select(Sample.ID, Number.of.read.pairs..quality.adaptor.trimmed.) |>
  rename(Sample_id = Sample.ID) |>
  rename(QC_reads = Number.of.read.pairs..quality.adaptor.trimmed.)

#import and combine read count files
#deduplicated and non-deduplicated

counts_dedup_bt <-
  read.table(
    "data_TE/TE_sequencing_experiment_readcount_per_sample_and_virus_bowtie2_dedup_atcc_ref.tsv",
    sep = "\t",
    header = TRUE
  )

counts_nodedup_bt <-
  read.table(
    "data_TE/TE_sequencing_experiment_readcount_per_sample_and_virus_bowtie2_nodedup_atcc_ref.tsv",
    sep = "\t",
    header = TRUE
  )

counts_bt_all <- rbind(counts_dedup_bt, counts_nodedup_bt)

counts_reads <- left_join(counts_bt_all, metadata2, by = "Sample_id")

#normalise by both raw read count and genome length - should be the same as mean read depth

counts_reads_norm <- counts_reads |>
  mutate(norm_counts1 = matched / QC_reads) |>
  mutate(norm_counts2 = matched / QC_reads / length) |>
  mutate(norm_counts3 = matched / length) |>
  mutate(genome_structure = case_when((
    virus == "Human_adenovirus_40" |
      virus == "Human_betaherpesvirus"
  ) ~ "DNA",
  .default = "RNA"
  ))

####compare library capture pool####

cols3 <- c("#228833", "#AA3377")

facet_names <- c("dedup_TE" = "Deduplicated",
                 "nodedup_TE" = "Non-Deduplicated")

metadata3 <- 
  metadata |>
  select(Sample.ID, Pool.for.sequencing) |>
  rename(Sample_id = Sample.ID) |>
  rename(Pool = Pool.for.sequencing)

counts_pool <- left_join(counts_reads_norm, metadata3, by = "Sample_id")

```

### Make plots
```{r}

pool_counts_sum <- 
  counts_pool |>
  filter(Background != "p6") |>
  filter(Background != "control") |>
  ggplot(aes(
    x = as.character(Viral.load),
    y = log10(matched),
    colour = Pool
  )) +
  geom_boxplot() +
  facet_grid( ~ type, labeller = as_labeller(facet_names)) +
  theme_few() +
  theme(
    # axis.title.x = element_blank(),
    # axis.text.x = element_blank(),
    legend.title = element_blank(),
    axis.title.y = element_text(size = 10),
    panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
    panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
  ) +
  ylab("Log (Viral Reads)") + xlab("Genome copies") +
  scale_color_manual(values = cols3, labels = c("Pool 1", "Pool 2")) + 
  scale_x_discrete(labels = c(bquote(10^{2}), bquote(10^{3}), bquote(10^{5})))
  

# pool_readcounts_norm <- 
#   counts_pool |>
#   filter(Background != "p6") |>
#   filter(Background != "control") |>
#   ggplot(aes(x = virus, y = log10(norm_counts1))) +
#   geom_boxplot() +
#   facet_grid(Pool ~ type) +
#   theme_few() +
#   theme(
#     axis.title.x = element_blank(),
#     # axis.text.x = element_blank(),
#     axis.title.y = element_text(size = 10),
#     panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
#     panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
#   ) +
#   ylab("log10(viral reads/cleaned reads)")

pool_norm1_sum <- 
  counts_pool |>
  filter(Background != "p6") |>
  filter(Background != "control") |>
  ggplot(aes(
    x = as.character(Viral.load),
    y = log10(norm_counts1),
    colour = Pool
  )) +
  geom_boxplot() +
  facet_grid( ~ type, labeller = as_labeller(facet_names)) +
  theme_few() +
  theme(
    # axis.title.x = element_blank(),
    # axis.text.x = element_blank(),
    legend.title = element_blank(),
    axis.title.y = element_text(size = 10),
    panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
    panel.grid.minor = element_line(linewidth=0.1, color = "lightgray") 
  ) +
  ylab("Log (viral reads/cleaned reads)") + xlab("Genome copies") +
  scale_color_manual(values = cols3, labels = c("Pool 1", "Pool 2")) + 
    scale_x_discrete(labels = c(bquote(10^{2}), bquote(10^{3}), bquote(10^{5})))

# pool_readcounts_norm2 <- 
#   counts_pool |>
#   filter(Background != "p6") |>
#   filter(Background != "control") |>
#   ggplot(aes(x = virus, y = log10(norm_counts2))) +
#   geom_boxplot() +
#   facet_grid(Pool ~ type) +
#   theme_few() +
#   theme(
#     axis.title.x = element_blank(),
#     # axis.text.x = element_blank(),
#     axis.title.y = element_text(size = 10),
#     panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
#     panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
#   ) +
#   ylab("log10(viral reads/cleaned reads/genome length)")


pool_norm2_sum <- 
  counts_pool |>
  filter(Background != "p6") |>
  filter(Background != "control") |>
  ggplot(aes(
    x = as.character(Viral.load),
    y = norm_counts2,
    colour = Pool
  )) +
  geom_boxplot() +
  facet_grid( ~ type, labeller = as_labeller(facet_names)) +
  theme_few() +
  theme(
    # axis.title.x = element_blank(),
    # axis.text.x = element_blank(),
    legend.title = element_blank(),
    axis.title.y = element_text(size = 10),
    panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
    panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
  ) +
  ylab("Log (viral reads/cleaned reads/genome length)") + xlab("Genome copies") +
  scale_color_manual(values = cols3, labels = c("Pool 1", "Pool 2")) + 
    scale_x_discrete(labels = c(bquote(10^{2}), bquote(10^{3}), bquote(10^{5})))


#import and combine read depth files

depth_dedup_bt <-
  read.table(
    "data_TE/TE_sequencing_experiment_readdepth_per_sample_and_virus_bowtie2_dedup_atcc_ref.tsv",
    sep = "\t",
    header = TRUE
  )

depth_nodedup_bt <-
  read.table(
    "data_TE/TE_sequencing_experiment_readdepth_per_sample_and_virus_bowtie2_nodedup_atcc_ref.tsv",
    sep = "\t",
    header = TRUE
  )

depths_bt_all <- rbind(depth_dedup_bt, depth_nodedup_bt)

depths_reads <- 
  left_join(depths_bt_all, metadata2, by = "Sample_id") |>
  mutate(genome_structure = case_when((
    virus == "Human_adenovirus_40" |
      virus == "Human_betaherpesvirus"
  ) ~ "DNA",
  .default = "RNA"
  ))

depths_pool <- left_join(depths_reads, metadata3, by = "Sample_id")

# pool_readdepths <- 
#   depths_pool |>
#   filter(Background != "p6") |>
#   filter(Background != "control") |>
#   ggplot(aes(x = virus, y = log10(mean_depth))) +
#   geom_boxplot() +
#   facet_grid(Pool ~ type) +
#   theme_few() +
#   theme(
#     axis.title.x = element_blank(),
#     axis.text.x = element_text(angle = 45, hjust = 1),
#     axis.title.y = element_text(size = 10),
#     panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
#     panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
#   ) +
#   ylab("Log (mean read depth)")

pool_depths_sum <-
  depths_pool |>
  filter(Background != "p6") |>
  filter(Background != "control") |>
  ggplot(aes(
    x = as.character(Viral.load),
    y = log10(mean_depth),
    colour = Pool
  )) +
  geom_boxplot() +
  facet_grid( ~ type, labeller = as_labeller(facet_names)) +
  theme_few() +
  theme(
    # axis.title.x = element_blank(),
    # axis.text.x = element_blank(),
    legend.title = element_blank(),
    axis.title.y = element_text(size = 10),
    panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
    panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
  ) +
  ylab("Log (mean read depth)") +
  xlab("Genome copies") + 
  scale_x_discrete(labels = c(bquote(10^{2}), bquote(10^{3}), bquote(10^{5}))) +
  scale_color_manual(values = cols3, labels = c("Pool 1", "Pool 2"))


```

### ***Final plot***
```{r}
ggarrange(
  pool_counts_sum,
  pool_depths_sum,
  nrow = 1,
  ncol = 2,
  common.legend = TRUE, 
  align = "hv"
)


#ggsave("figures/compare_spike_ins_atcc/pool_compare_viruses_sum.png",width=10,height=7)

ggsave("figures/manuscript_figure_2025/PDF/Figure_S1.pdf",
       width = 12,
       height = 4)

ggsave("figures/manuscript_figure_2025/PNG/Figure_S1.png",
       width = 12,
       height = 4)

```

### --------------------


## Figure S2

### Read counts
```{r}
### Read-in data

#ATCC genomes updated version
#October 2024

#viral load vs read count normalised using different methods, comparing deduplicated and non-deduplicated
#use bowtie2 data

####read counts/normalised read counts####

#metadata

metadata <- read.csv("metadata/sampleIDs_TESpikeIn.csv", header = TRUE)

metadata2 <- metadata |>
  select(Sample.ID, Number.of.read.pairs..quality.adaptor.trimmed.) |>
  rename(Sample_id = Sample.ID) |>
  rename(QC_reads = Number.of.read.pairs..quality.adaptor.trimmed.)

#import and combine read count files
#deduplicated and non-deduplicated

counts_dedup_bt <-
  read.table(
    "data_TE/TE_sequencing_experiment_readcount_per_sample_and_virus_bowtie2_dedup_atcc_ref.tsv",
    sep = "\t",
    header = TRUE
  )

counts_nodedup_bt <-
  read.table(
    "data_TE/TE_sequencing_experiment_readcount_per_sample_and_virus_bowtie2_nodedup_atcc_ref.tsv",
    sep = "\t",
    header = TRUE
  )

counts_bt_all <- rbind(counts_dedup_bt, counts_nodedup_bt)

counts_reads <- left_join(counts_bt_all, metadata2, by = "Sample_id")

#normalise by both raw read count and genome length - should be the same as mean read depth

counts_reads_norm <- counts_reads |>
  mutate(norm_counts1 = matched / QC_reads) |>
  mutate(norm_counts2 = matched / QC_reads / length) |>
  mutate(norm_counts3 = matched / length) |>
  mutate(genome_structure = case_when((
    virus == "Human_adenovirus_40" |
      virus == "Human_betaherpesvirus"
  ) ~ "DNA",
  .default = "RNA"
  ))

```


### Read depths
```{r}
####read depths####

#import and combine read depth files

depth_dedup_bt <-
  read_tsv(
    "data_TE/TE_sequencing_experiment_readdepth_per_sample_and_virus_bowtie2_dedup_atcc_ref.tsv"
  )

depth_nodedup_bt <-
  read_tsv(
    "data_TE/TE_sequencing_experiment_readdepth_per_sample_and_virus_bowtie2_nodedup_atcc_ref.tsv"
  )

depths_bt_all <- rbind(depth_dedup_bt, depth_nodedup_bt)

depths_reads <- 
  depths_bt_all |>
  left_join(metadata2, by = "Sample_id") |>
  mutate(genome_structure = case_when((
    virus == "Human_adenovirus_40" |
      virus == "Human_betaherpesvirus"
  ) ~ "DNA",
  .default = "RNA"
  )) |>
  rename(Viral.load = `Viral load`)

```


### Format data
```{r}
#change labels in facet plots

facet_names <- c("dedup_TE" = "Deduplicated",
                 "nodedup_TE" = "Non-Deduplicated")

cols <- c("#4477AA",
          "#66CCEE",
          "#228833",
          "#CCBB44",
          "#EE6677",
          "#AA3377")
```

### Make plots
```{r}

#plot viral read counts (non normalised)

read_count <- 
  counts_reads_norm |>
  filter(Background != "p6") |>
  filter(Background != "control") |>
  ggplot(aes(
    x = Viral.load,
    y = log10(matched),
    colour = virus
  )) +
  geom_point() +
  geom_smooth(aes(group = virus, linetype = genome_structure), se = FALSE) +
  facet_grid( ~ type, labeller = as_labeller(facet_names)) +
  theme_few() +
  theme(
    axis.title.x = element_text(size = 12),
    axis.text.x = element_text(angle = 0, hjust = 1),
    axis.title.y = element_text(size = 12),
    legend.title = element_blank(),
    legend.position = "top", 
    panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
    panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
  ) +
  scale_color_manual(
    values = cols,
    labels = c(
      "Human adenovirus 40",
      "Human betaherpesvirus",
      "Human respiratory syncytial virus",
      "Influenza B virus",
      "Mammalian orthoreovirus 3",
      "Zika virus"
    )
  ) +
  scale_linetype_manual(values=c("dotted", "solid")) + 
  ylab("Log (Viral Reads)") +
  xlab("Genome copies") + 
  scale_x_log10(labels = scales::trans_format("log10", scales::label_math()))

read_count_norm1 <- 
  counts_reads_norm |>
  filter(Background != "p6") |>
  filter(Background != "control") |>
  ggplot(aes(
    x = Viral.load,
    y = log10(norm_counts1),
    colour = virus
  )) +
  geom_point() +
  geom_smooth(aes(group = virus, linetype = genome_structure), se = FALSE) +
  facet_grid( ~ type, labeller = as_labeller(facet_names)) +
  theme_few() +
  theme(
    axis.title.x = element_text(size = 12),
    axis.text.x = element_text(angle = 0, hjust = 1),
    axis.title.y = element_text(size = 12),
    legend.title = element_blank(),
    legend.position = "top", 
    panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
    panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
  ) +
  scale_color_manual(
    values = cols,
    labels = c(
      "Human adenovirus 40",
      "Human betaherpesvirus",
      "Human respiratory syncytial virus",
      "Influenza B virus",
      "Mammalian orthoreovirus 3",
      "Zika virus"
    )
  ) +
  scale_linetype_manual(values=c("dotted", "solid")) + 
  ylab("Log (viral reads/cleaned reads)") +
  xlab("Genome copies") + 
  scale_x_log10(labels = scales::trans_format("log10", scales::label_math()))

#normalise by raw read count and genome length

read_count_norm2 <- 
  counts_reads_norm |>
  filter(Background != "p6") |>
  filter(Background != "control") |>
  ggplot(aes(
    x = (Viral.load),
    y = log10(norm_counts2),
    colour = virus
  )) +
  geom_point() +
  geom_smooth(aes(group = virus, linetype = genome_structure), se = FALSE) +
  facet_grid( ~ type, labeller = as_labeller(facet_names)) +
  theme_few() +
  theme(
    axis.title.x = element_text(size = 12),
    axis.text.x = element_text(angle = 0, hjust = 1),
    axis.title.y = element_text(size = 12),
    legend.title = element_blank(),
    legend.position = "top", 
    panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
    panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
  ) +
  scale_color_manual(
    values = cols,
    labels = c(
      "Human adenovirus 40",
      "Human betaherpesvirus",
      "Human respiratory syncytial virus",
      "Influenza B virus",
      "Mammalian orthoreovirus 3",
      "Zika virus"
    )
  ) +
  scale_linetype_manual(values=c("dotted", "solid")) + 
  ylab("Log (viral reads/cleaned reads/genome length)") +
  xlab("Genome copies") + 
  scale_x_log10(labels = scales::trans_format("log10", scales::label_math()))


read_depths <- 
  depths_reads |>
  filter(Background != "p6") |>
  filter(Background != "control") |>
  ggplot(aes(
    x = Viral.load,
    y = log10(mean_depth),
    colour = virus
  )) +
  geom_point() +
  geom_smooth(aes(group = virus, linetype = genome_structure), se = FALSE) +
  facet_grid( ~ type, labeller = as_labeller(facet_names)) +
  theme_few() +
  theme(
    axis.title.x = element_text(size = 12),
    axis.text.x = element_text(angle = 0, hjust = 1),
    axis.title.y = element_text(size = 12),
    legend.title = element_blank(),
    legend.position = "top", 
    panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
    panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
  ) +
  scale_color_manual(
    values = cols,
    labels = c(
      "Human adenovirus 40",
      "Human betaherpesvirus",
      "Human respiratory syncytial virus",
      "Influenza B virus",
      "Mammalian orthoreovirus 3",
      "Zika virus"
    )
  ) +
  scale_linetype_manual(values=c(2, 1)) +
  scale_linetype_manual(values=c("dotted", "solid")) + 
  ylab("Log (mean read depth)") +
  xlab("Genome copies") + 
  scale_x_log10(labels = scales::trans_format("log10", scales::label_math()))

#ggsave("figures/compare_spike_ins_atcc/mean_depth.pdf",width=8,height=6)

```

### ***Final plot***
```{r, warning = FALSE, echo = FALSE, message = FALSE}

ggarrange(
  read_count,
  read_count_norm1,
  read_count_norm2,
  read_depths,
  nrow = 2,
  ncol = 2,
  common.legend = TRUE,
  align = "hv"
)

#ggsave("figures/compare_spike_ins_atcc/log_read_count_depth.png",width=10,height=7)

#ggsave("figures/manuscript_figures_pdf/FigureS1.pdf",width=10,height=7)

#apparently normalised read count using method 2 should not actually be the same

ggsave("figures/manuscript_figure_2025/PDF/Figure_S2.pdf",
       width = 12,
       height = 8)

ggsave("figures/manuscript_figure_2025/PNG/Figure_S2.png",
       width = 12,
       height = 8)


```


### --------------------


## Figure S3
```{r}

##plots of read counts and viral read counts
#November 2024

#metadata

metadata <-
  read.csv(
    "metadata/sampleIDs_TESpikeIn.csv",
    header = TRUE
  )

metadata2 <- metadata |>
  select(Sample.ID, Number.of.read.pairs..quality.adaptor.trimmed.) |>
  rename(Sample_id = Sample.ID) |>
  rename(QC_reads = Number.of.read.pairs..quality.adaptor.trimmed.)

#import read count files

counts_dedup_bt <-
  read.table(
    "data_TE/TE_sequencing_experiment_readcount_per_sample_and_virus_bowtie2_dedup_atcc_ref.tsv",
    sep = "\t",
    header = TRUE
  )

counts_nodedup_bt <-
  read.table(
    "data_TE/TE_sequencing_experiment_readcount_per_sample_and_virus_bowtie2_nodedup_atcc_ref.tsv",
    sep = "\t",
    header = TRUE
  )

#import viral reads mapped (to calculate proportions)

viral_reads_dedup <-
  read.csv(
    "data_TE/total_virus_mapped_reads_per_sample_dedup_atcc_ref_20241108.csv",
    header = TRUE
  )

viral_reads_nodedup <-
  read.csv(
    "data_TE/total_virus_mapped_reads_per_sample_nodedup_atcc_ref_20241108.csv",
    header = TRUE
  )

cols2 <- c("#BB5566", "#004488")

facet_names <- c("dedup_TE" = "Deduplicated",
                 "nodedup_TE" = "Non-Deduplicated")

reads_metadata_dedup <-
  left_join(counts_dedup_bt, metadata2, by = "Sample_id")

reads_viral_dedup <-
  left_join(reads_metadata_dedup, viral_reads_dedup, by = "Sample_id")

reads_metadata_nodedup <-
  left_join(counts_nodedup_bt, metadata2, by = "Sample_id")

reads_viral_nodedup <-
  left_join(reads_metadata_nodedup, viral_reads_nodedup, by = "Sample_id")

reads_viral_all <- rbind(reads_viral_dedup, reads_viral_nodedup)


reads_plot <- 
  reads_viral_all |>
  group_by(Background, Sample_id, Viral.load, type) |>
  summarise(
    total_reads = (QC_reads * 2),
    viral_reads = total_virus_reads,
    ATCC_reads = sum(matched),
    prop_ATCC = ATCC_reads / total_reads,
    prop_viral = total_virus_reads / total_reads,
    diff = viral_reads - ATCC_reads
  ) |>
  unique()

```

### Make plots
```{r, echo=FALSE, warning=FALSE, message=FALSE}

total_reads <- 
  reads_plot |>
  filter(Background != "p6") |>
  filter(Background != "control") |>
  ggplot(aes(
    x = Viral.load,
    y = (total_reads),
    colour = Background
  )) +
  geom_point() +
  geom_smooth(aes(group = Background), se = FALSE) +
  facet_grid( ~ type, labeller = as_labeller(facet_names)) +
  theme_few() +
  theme(
    axis.title.y = element_text(size = 10),
    panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
    panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
  ) +
  scale_y_log10(labels = scales::trans_format("log10", scales::label_math())) + 
    scale_x_log10(labels = scales::trans_format("log10", scales::label_math())) + xlab("Genome copies") +
  # ggtitle("All") +
  ylab("Total number of reads") +
  scale_color_manual(values = cols2, labels = c("M1", "M2"))

viral_reads <- 
  reads_plot |>
  filter(Background != "p6") |>
  filter(Background != "control") |>
  ggplot(aes(
    x = Viral.load,
    y = (viral_reads),
    colour = Background
  )) +
  geom_point() +
  geom_smooth(aes(group = Background), se = FALSE) +
  facet_grid( ~ type, labeller = as_labeller(facet_names)) +
  theme_few() +
  theme(
    axis.title.y = element_text(size = 10),
    panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
    panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
  ) +
  scale_y_log10(labels = scales::trans_format("log10", scales::label_math())) + 
    scale_x_log10(labels = scales::trans_format("log10", scales::label_math())) + xlab("Genome copies") +
  # ggtitle("Viral reads") +
  ylab("Number of viral reads") +
  scale_color_manual(values = cols2, labels = c("M1", "M2"))

ATCC_reads <- 
  reads_plot |>
  filter(Background != "p6") |>
  filter(Background != "control") |>
  ggplot(aes(
    x = Viral.load,
    y = (ATCC_reads),
    colour = Background
  )) +
  geom_point() +
  geom_smooth(aes(group = Background), se = FALSE) +
  facet_grid( ~ type, labeller = as_labeller(facet_names)) +
  theme_few() +
  theme(
    axis.title.y = element_text(size = 10),
    panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
    panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
  ) +
  scale_y_log10(labels = scales::trans_format("log10", scales::label_math())) + 
    scale_x_log10(labels = scales::trans_format("log10", scales::label_math())) + xlab("Genome copies") +
  ggtitle("Spike-in Viral Reads") +
  ylab("Spike-in Viral Reads") +
  scale_color_manual(values = cols2, labels = c("M1", "M2"))

prop_ATCC <- reads_plot |>
  filter(Background != "p6") |>
  filter(Background != "control") |>
  ggplot(aes(x = Viral.load, y = prop_ATCC, colour = Background)) +
  geom_point() +
  geom_smooth(aes(group = Background), se = FALSE) +
  facet_grid( ~ type, labeller = as_labeller(facet_names)) +
  theme_few() +
  theme(
    axis.title.y = element_text(size = 10),
    panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
    panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
  ) +
    scale_x_log10(labels = scales::trans_format("log10", scales::label_math())) + xlab("Genome copies") +
  ggtitle("Spike-in viral reads") +
  ylab("Proportion") +
  scale_color_manual(values = cols2, labels = c("M1", "M2"))

prop_viral <- reads_plot |>
  filter(Background != "p6") |>
  filter(Background != "control") |>
  ggplot(aes(x = Viral.load, y = prop_viral, colour = Background)) +
  geom_point() +
  geom_smooth(aes(group = Background), se = FALSE) +
  facet_grid( ~ type, labeller = as_labeller(facet_names)) +
  theme_few() +
  theme(
    axis.title.y = element_text(size = 10),
    panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
    panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
  ) +
 scale_x_log10(labels = scales::trans_format("log10", scales::label_math())) + xlab("Genome copies") +
  # ggtitle("Viral reads") +
  ylab("Proportion of viral reads") +
  scale_color_manual(values = cols2, labels = c("M1", "M2")) + scale_y_continuous(limits=c(0,1.0))


#ggarrange(total_reads,viral_reads,ATCC_reads,prop_viral,prop_ATCC,nrow=2,ncol=3,common.legend = TRUE)

```

### ***Final plot***
```{r, warning=FALSE, echo=FALSE, message=FALSE}

ggarrange(total_reads,
          viral_reads,
          prop_viral,
          nrow = 3,
          common.legend = TRUE,
          align = "hv")

#ggsave("figures/compare_spike_ins_atcc/backgrounds_reads.png")

ggsave("figures/manuscript_figure_2025/PDF/Figure_S3.pdf", 
       width = 8,
       height = 8)

ggsave("figures/manuscript_figure_2025/PNG/Figure_S3.png", 
       width = 8,
       height = 8)

####compare proportion viral reads per pool with shotgun####

# metadata3 <- metadata |>
#   select(Sample.ID,
#          Number.of.read.pairs..quality.adaptor.trimmed.,
#          Pool.for.sequencing) |>
#   rename(Sample_id = Sample.ID) |>
#   rename(QC_reads = Number.of.read.pairs..quality.adaptor.trimmed.)
# 
# reads_viral_dedup3 <-
#   full_join(metadata3, viral_reads_dedup, by = "Sample_id")
# 
# reads_viral_nodedup3 <-
#   full_join(metadata3, viral_reads_nodedup, by = "Sample_id")
# 
# reads_viral_dedup3 |>
#   group_by(Pool.for.sequencing) |>
#   summarise(
#     viral_reads = sum(total_virus_reads),
#     total_reads = sum(QC_reads * 2),
#     prop_viral = viral_reads / total_reads
#   )
# 
# polyomics <-
#   read.csv(
#     "data_polyomics/total_virus_mapped_reads_per_sample_dedup.csv"
#   )
# 
# #read data from polyomics_indexes_stefano document
# total_reads <- data.frame(total_reads = c(5942760, 7714275))
# 
# keeps <- c("RNA-Msp-p2", "RNA-Msp-p8")
# 
# polyomics_read_prop <- polyomics |>
#   filter(Sample_id %in% keeps)
# 
# polyomics_read_prop2 <- cbind(polyomics_read_prop, total_reads)
# 
# polyomics_read_prop2 |>
#   summarise(prop_viral = total_virus_reads / total_reads)

```

### --------------------

## No longer required? Figure S4

### Make plots
```{r}
####read counts/depths split by background####

#change labels in facet plots

facet_names_bg <- c(
  "dedup_TE" = "Deduplicated",
  "nodedup_TE" = "Non-Deduplicated",
  "p2" = "M1",
  "p8" = "M2"
)

#break down by Background x virus

background_counts_reads <- 
  counts_reads_norm |>
  filter(Background != "p6") |>
  filter(Background != "control") |>
  ggplot(aes(
    x = Viral.load,
    y = log10(matched),
    colour = virus
  )) +
  geom_point() +
  geom_smooth(aes(group = virus, linetype = genome_structure), se = FALSE) +
  facet_grid(Background ~ type, labeller = as_labeller(facet_names_bg)) +
  theme_few() +
  theme(
    axis.title.x = element_blank(),
    axis.title.y = element_text(size = 10),
    panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
    panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
  ) +
  scale_color_manual(
    values = cols,
    labels =
      c(
      "Human adenovirus 40",
      "Human betaherpesvirus",
      "Human respiratory syncytial virus",
      "Influenza B virus",
      "Mammalian orthoreovirus 3",
      "Zika virus"
      )
  ) +
  scale_x_log10(labels = scales::label_log10()) +
  ylab("Log (Viral Reads)")

backgrounds_counts_reads_norm <- 
  counts_reads_norm |>
  filter(Background != "p6") |>
  filter(Background != "control") |>
  ggplot(aes(
    x = Viral.load,
    y = log10(norm_counts1),
    colour = virus
  )) +
  geom_point() +
  geom_smooth(aes(group = virus, linetype = genome_structure), se = FALSE) +
  facet_grid(Background ~ type, labeller = as_labeller(facet_names_bg)) +
  theme_bw() +
  theme_few() +
  theme(
    axis.title.x = element_blank(),
    axis.title.y = element_text(size = 10),
    panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
    panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
  ) +
  scale_color_manual(
    values = cols,
    labels = c(
      "Human adenovirus 40",
      "Human betaherpesvirus",
      "Human respiratory syncytial virus",
      "Influenza B virus",
      "Mammalian orthoreovirus 3",
      "Zika virus"
    )
  ) +
  scale_x_log10(label = scales::label_log10()) +
  ylab("Log (viral reads/cleaned reads)")

backgrounds_counts_reads_norm2 <- 
  counts_reads_norm |>
  filter(Background != "p6") |>
  filter(Background != "control") |>
  ggplot(aes(
    x = Viral.load,
    y = log10(norm_counts2),
    colour = virus
  )) +
  geom_point() +
  geom_smooth(aes(group = virus, linetype = genome_structure), se = FALSE) +
  facet_grid(Background ~ type, labeller = as_labeller(facet_names_bg)) +
  theme_few() +
  theme(
    axis.title.x = element_blank(),
    axis.title.y = element_text(size = 10),
    panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
    panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
  ) +
  scale_color_manual(
    values = cols,
    labels = c(
      "Human adenovirus 40",
      "Human betaherpesvirus",
      "Human respiratory syncytial virus",
      "Influenza B virus",
      "Mammalian orthoreovirus 3",
      "Zika virus"
    )
  ) +
  scale_x_log10(labels = scales::label_log10()) +
  ylab("Log (viral reads/cleaned reads/genome length)")

background_read_depths <- 
  depths_reads |>
  filter(Background != "p6") |>
  filter(Background != "control") |>
  ggplot(aes(
    x = Viral.load,
    y = log10(mean_depth),
    colour = virus
  )) +
  geom_point() +
  geom_smooth(aes(group = virus, linetype = genome_structure), se = FALSE) +
  facet_grid(Background ~ type, labeller = as_labeller(facet_names_bg)) +
  theme_few() +
  theme(
    axis.title.x = element_blank(),
    panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
    panel.grid.minor = element_line(linewidth=0.1, color = "lightgray"),
    axis.title.y = element_text(size = 10),
    legend.title = element_blank()
  ) +
  scale_color_manual(
    values = cols,
    labels = c(
      "Human adenovirus 40",
      "Human betaherpesvirus",
      "Human respiratory syncytial virus",
      "Influenza B virus",
      "Mammalian orthoreovirus 3",
      "Zika virus"
    )
  ) +
  scale_x_log10(label = scales::label_log10()) +
  ylab("Log (mean read depth)")

```

### ***Final plot*** 
```{r, warning=FALSE, message=FALSE, echo=FALSE, fig.height=6, fig.height=8}

ggarrange(
  background_counts_reads,
  backgrounds_counts_reads_norm,
  backgrounds_counts_reads_norm2,
  background_read_depths,
  nrow = 2,
  ncol = 2,
  common.legend = TRUE,
  align = "hv"
)

#ggsave("figures/compare_spike_ins_atcc/backgrounds_read_count_depth.png",width=10,height=7)

#ggsave("figures/manuscript_figures_pdf/FigureS4.pdf",width=10,height=7)

```


### --------------------
### --------------------

## >>> Figure 2: plots of TE experiment data - genome coverage & per site coverage

### Read-in data
```{r}

#plots of TE experiment data - genome coverage & per site coverage
#also separated by viruses
#ATCC genomes updated version
#October 2024

#use bowtie2 data

####genome coverage####

unzip(zipfile = "data_TE/TE_sequencing_experiment_readdepth_per_site_sample_and_virus_bowtie2_dedup_atcc_ref.tsv.zip", exdir = "data_TE/")

unzip(zipfile = "data_TE/TE_sequencing_experiment_readdepth_per_site_sample_and_virus_bowtie2_nodedup_atcc_ref.tsv.zip", exdir = "data_TE/")

dedup_per_site <-
  read_tsv(
    "data_TE/TE_sequencing_experiment_readdepth_per_site_sample_and_virus_bowtie2_dedup_atcc_ref.tsv"
  )


nodedup_per_site <-
  read_tsv(
    "data_TE/TE_sequencing_experiment_readdepth_per_site_sample_and_virus_bowtie2_nodedup_atcc_ref.tsv"
  )

file.remove("data_TE/TE_sequencing_experiment_readdepth_per_site_sample_and_virus_bowtie2_dedup_atcc_ref.tsv")

file.remove( "data_TE/TE_sequencing_experiment_readdepth_per_site_sample_and_virus_bowtie2_nodedup_atcc_ref.tsv")

#add length column from read depth file to per site file

#metadata

metadata <-
  read.csv(
    "metadata/sampleIDs_TESpikeIn.csv",
    header = TRUE
  )

metadata2 <- 
  metadata |>
  select(Sample.ID, Number.of.read.pairs..quality.adaptor.trimmed.) |>
  rename(Sample_id = Sample.ID) |>
  rename(QC_reads = Number.of.read.pairs..quality.adaptor.trimmed.)

#import and combine read count files

#deduplicated and non-deduplicated

counts_dedup_bt <-
  read_tsv(
    "data_TE/TE_sequencing_experiment_readcount_per_sample_and_virus_bowtie2_dedup_atcc_ref.tsv"
  )

counts_nodedup_bt <-
  read_tsv(
    "data_TE/TE_sequencing_experiment_readcount_per_sample_and_virus_bowtie2_nodedup_atcc_ref.tsv"
  )

counts_bt_all <- rbind(counts_dedup_bt, counts_nodedup_bt)

counts_reads <- left_join(counts_bt_all, metadata2, by = "Sample_id")

```

### Format data
```{r}
#normalise by both raw read count and genome length - should be the same as mean read depth

counts_reads_norm <- 
  counts_reads |>
  mutate(norm_counts1 = matched / QC_reads) |>
  mutate(norm_counts2 = matched / QC_reads / length) |>
  mutate(norm_counts3 = matched / length) |>
  mutate(genome_structure = case_when((
    virus == "Human_adenovirus_40" |
      virus == "Human_betaherpesvirus"
  ) ~ "DNA",
  .default = "RNA"
  ))

lengths <- 
  counts_reads_norm |>
  select(virus, length) |>
  distinct()

#calculate genome coverage

persite_coverage_dedup <- 
  dedup_per_site |>
  rename(Viral.load = `Viral load`) |>
  group_by(Sample_id, virus, Viral.load, Background, type) |>
  filter(coverage > 0) |>
  summarise(genome_coverage = n()) |>
  left_join(lengths) |>
  mutate(percent_coverage = genome_coverage / length)

persite_coverage_nodedup <- 
  nodedup_per_site |>
  rename(Viral.load = `Viral load`) |>
  group_by(Sample_id, virus, Viral.load, Background, type) |>
  filter(coverage > 0) |>
  summarise(genome_coverage = n()) |>
  left_join(lengths) |>
  mutate(percent_coverage = genome_coverage / length)

persite_coverage_both <-
  rbind(persite_coverage_dedup, 
        persite_coverage_nodedup)

coverage_labels <- c("0" = "Control",
                     "100" = "10^{2}",
                     "1000" = "10^{3}",
                     "10000" = "10^{5}",
                     "dedup_TE" = "Deduplicated",
                     "nodedup_TE"= "Non-Deduplicated")

coverage_labels2 <- c("0" = "Control",
                      "100" = "10^{2}",
                      "1000" = "10^{3}",
                      "10000" = "10^{5}")
```

### Make plots
```{r, echo = FALSE, warning=FALSE, message=FALSE, fig.height=4, fig.width=6}

cols2 <- c("#BB5566",
           "#004488")

cols4 <- c("#DDAA33",
           "#BB5566",
           "#004488")


persite_coverage_both$Viral.load <- factor(persite_coverage_both$Viral.load, labels = c("0",
    "10^{2}", 
    "10^{3}", 
    "10^{5}"
    ))


##are the deduplicated and non deduplicated datasets identical??
##if they are the same can maybe just show one
#show deduplicated

persite_coverage_both |>
  filter(Background != "p6") |>
  filter(Background != "control") |>
  filter(type == "dedup_TE") |>
  ggplot(aes(x = virus, y = percent_coverage, colour = Background)) +
  geom_boxplot(outlier.shape = NA) +
  geom_point(position = position_dodge(width = .75)) +
  facet_grid(~as.character(Viral.load), labeller = label_parsed) +
  theme_few() +
  ylab("Genome coverage") +
  theme(
    axis.title.x = element_text(size = 12),
    axis.text.x = element_text(angle = 30, hjust = 1),
    axis.title.y = element_text(size = 12),
    legend.title = element_blank(),
    legend.position = "top", 
    panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
    panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
  ) +
  scale_color_manual(values = cols2, labels = c("M1", "M2")) +
  scale_x_discrete(
    labels = c(
      "Human adenovirus 40",
      "Human betaherpesvirus",
      "Human respiratory syncytial virus",
      "Influenza B virus",
      "Mammalian orthoreovirus 3",
      "Zika virus"
    )
  ) +
  scale_y_continuous(limits = c(0, 1))

```

### ***Final plot***
```{r, echo = FALSE, warning=FALSE, message=FALSE}
#ggsave("figures/compare_spike_ins_atcc/genome_cov_deduponly.png",width=15,height=6)

ggsave(
  "figures/manuscript_figure_2025/PDF/Figure_2.pdf",
  width = 15,
  height = 6
)

ggsave(
  "figures/manuscript_figure_2025/PNG/Figure_2.png",
  width = 15,
  height = 6
)


```


### --------------------


### --------------------

### Figure S4
```{r}

persite_both <- rbind(dedup_per_site, nodedup_per_site)

persite_reads <- left_join(persite_both, metadata2, by = "Sample_id")

persite_norm <- persite_reads |>
  mutate(norm_cov = coverage / QC_reads)

coverage_labels3 <-c("100" = "1e+02",
                    "1000" = "1e+03",
                    "100000" = "1e+05",
                    "p2" = "M1",
                    "p8"= "M2")

persite_norm$`Viral load` <- 
  factor(persite_norm$`Viral load`, 
         labels = c("0",
                    "10^{2}", 
                    "10^{3}",
                    "10^{5}"))


persite_norm |>
  filter(Background != "p6") |>
  filter(Background != "control") |>
  filter(type == "dedup_TE") |>
  filter(virus == "Human_respiratory_syncytial_virus") |>
  rename(Viral.load = `Viral load`) |>
  ggplot(aes(x = site, y = coverage, fill = Background)) +
  geom_col() +
  facet_wrap(Viral.load ~ Sample_id, label = label_parsed) +
  ylab("Coverage") +
  ggtitle("Human RSV (deduplicated)") +
  guides(fill = guide_legend(title = "Background")) +
  theme_few() +
  theme(
    axis.title.x = element_text(size = 12),
    axis.text.x = element_text(angle = 30, hjust = 1),
    axis.title.y = element_text(size = 12),
    legend.title = element_blank(),
    legend.position = "top", 
    panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
    panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
  ) +
  scale_y_log10() +
  scale_fill_manual(values = cols2, labels = c("M1", "M2"))
```

#### ***Final plot***
```{r}
ggsave("figures/manuscript_figure_2025/PDF/Figure_S4.pdf",
       width=15,
       height=10)

ggsave("figures/manuscript_figure_2025/PNG/Figure_S4.png",
       width=15,
       height=10)
```


### --------------------

### Figure S5
```{r, warning=FALSE, message=FALSE, echo=FALSE}

persite_norm |>
  filter(Background != "p6") |>
  filter(Background != "control") |>
  filter(type == "dedup_TE") |>
  filter(virus == "Zika_virus") |>
  rename(Viral.load = `Viral load`) |>
  ggplot(aes(x = site, y = coverage, fill = Background)) +
  geom_col() +
  facet_wrap(Viral.load ~ Sample_id, label = label_parsed) +
  ylab("Coverage") +
  ggtitle("Zika virus (deduplicated)") +
  guides(fill = guide_legend(title = "Background")) +
  theme_few() +
  theme(
    axis.title.x = element_text(size = 12),
    axis.text.x = element_text(angle = 30, hjust = 1),
    axis.title.y = element_text(size = 12),
    legend.title = element_blank(),
    legend.position = "top", 
    panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
    panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
  ) +
  scale_y_log10() +
  scale_fill_manual(values = cols2, labels = c("M1", "M2"))

```

####***Final plot***
```{r}

ggsave("figures/manuscript_figure_2025/PDF/Figure_S5.pdf",
       width=15,
       height=10)

ggsave("figures/manuscript_figure_2025/PNG/Figure_S5.png",
       width=15,
       height=10)

```


### --------------------

### Figure S6
```{r, warning=FALSE, message=FALSE, echo=FALSE}

persite_norm |>
  filter(Background != "p6") |>
  filter(Background != "control") |>
  filter(type == "dedup_TE") |>
  filter(virus == "Human_adenovirus_40") |>
  rename(Viral.load = `Viral load`) |>
  ggplot(aes(x = site, y = coverage, fill = Background)) +
  geom_col() +
  facet_wrap(Viral.load ~ Sample_id, label = label_parsed) +
  ylab("Coverage") +
  ggtitle("Human Adenovirus (deduplicated)") +
  guides(fill = guide_legend(title = "Background")) +
 theme_few() +
  theme(
    axis.title.x = element_text(size = 12),
    axis.text.x = element_text(angle = 30, hjust = 1),
    axis.title.y = element_text(size = 12),
    legend.title = element_blank(),
    legend.position = "top", 
    panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
    panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
  ) +
  scale_y_log10() +
  scale_fill_manual(values = cols2, labels = c("M1", "M2"))

```

####***Final plot***
```{r}

ggsave("figures/manuscript_figure_2025/PDF/Figure_S6.pdf",
       width=15,
       height=10)

ggsave("figures/manuscript_figure_2025/PNG/Figure_S6.png",
       width=15,
       height=10)

```


### --------------------

### Figure S7
```{r}
HBV <- 
persite_norm |>
  filter(Background != "p6") |>
  filter(Background != "control") |>
  filter(type == "dedup_TE") |>
  filter(virus == "Human_betaherpesvirus") |>
  rename(Viral.load = `Viral load`) |>
  ggplot(aes(x = site, y = coverage, fill = Background)) +
  geom_col() +
  facet_wrap(Viral.load ~ Sample_id, label = label_parsed) +
  ylab("Coverage") +
  ggtitle("Human Betaherpesvirus (deduplicated)") +
  guides(fill = guide_legend(title = "Background")) +
 theme_few() +
  theme(
    axis.title.x = element_text(size = 12),
    axis.text.x = element_text(angle = 30, hjust = 1),
    axis.title.y = element_text(size = 12),
    legend.title = element_blank(),
    legend.position = "top", 
    panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
    panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
  ) +
  scale_y_log10() +
  scale_fill_manual(values = cols2, labels = c("M1", "M2"))

```

####***Final plot***
```{r}

ggsave(plot = HBV, file="figures/manuscript_figure_2025/PDF/Figure_S7.pdf",
       width=15,
       height=10,
       dpi = 600)

ggsave(plot = HBV, file="figures/manuscript_figure_2025/PNG/Figure_S7.png",
       width=15,
       height=10)


```



### --------------------

### Figure S8
```{r}
#have to do these differently due to segmentation

FLU_ME1 <- persite_norm |>
  filter(virus == "Influenza_B_virus") |>
  filter(type == "dedup_TE") |>
  filter(Background == "p2") |>
  rename(Viral.load = `Viral load`) |>
  ggplot(aes(
    x = site,
    y = coverage,
    fill = as.character(Viral.load)
  )) +
  geom_col() +
  facet_grid(seg ~ Sample_id) +
  ylab("Coverage") +
  ggtitle("Influenza B virus (Background M1 deduplicated)") +
  guides(fill = guide_legend(title = "Viral load")) +
 theme_few() +
  theme(
    axis.title.x = element_text(size = 12),
    axis.text.x = element_text(angle = 30, hjust = 1),
    axis.title.y = element_text(size = 12),
    legend.title = element_blank(),
    legend.position = "top", 
    panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
    panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
  ) +
  scale_y_log10() +
  scale_fill_manual(values = cols4, label = c(expression("10"^"2"), expression("10"^"3"), expression("10"^"5")))

FLU_ME2 <- persite_norm |>
  filter(virus == "Influenza_B_virus") |>
  filter(type == "dedup_TE") |>
  filter(Background == "p8") |>
  rename(Viral.load = `Viral load`) |>
  ggplot(aes(
    x = site,
    y = coverage,
    fill = as.character(Viral.load)
  )) +
  geom_col() +
  facet_grid(seg ~ Sample_id) +
  ylab("Coverage") +
  ggtitle("Influenza B virus (Background M2 deduplicated)") +
  guides(fill = guide_legend(title = "Viral load")) +
 theme_few() +
  theme(
    axis.title.x = element_text(size = 12),
    axis.text.x = element_text(angle = 30, hjust = 1),
    axis.title.y = element_text(size = 12),
    legend.title = element_blank(),
    legend.position = "top", 
    panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
    panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
  ) +
  scale_y_log10() +
  scale_fill_manual(values = cols4, label = c(expression("10"^"2"), expression("10"^"3"), expression("10"^"5")))


ggarrange(FLU_ME1, FLU_ME2, ncol = 2, common.legend = TRUE)

```

####***Final plot***
```{r}

ggsave("figures/manuscript_figure_2025/PDF/Figure_S8.pdf",
       width=20,
       height=12,
       dpi=600)

ggsave("figures/manuscript_figure_2025/PNG/Figure_S8.png",
       width=20,
       height=12)
```


### --------------------

### Figure S9
```{r}

REO_ME1 <- 
  persite_norm |>
  filter(virus == "Mammalian_orthoreovirus3") |>
  filter(type == "dedup_TE") |>
  filter(Background == "p2") |>
  rename(Viral.load = `Viral load`) |>
  ggplot(aes(
    x = site,
    y = coverage,
    fill = as.character(Viral.load)
  )) +
  geom_col() +
  facet_grid(seg ~ Sample_id) +
  ylab("Coverage") +
  ggtitle("Mammalian orthoreovirus 3 (Background M1 deduplicated)") +
  guides(fill = guide_legend(title = "Viral load")) +
  theme_few() +
  theme(
    axis.title.x = element_text(size = 12),
    axis.text.x = element_text(angle = 30, hjust = 1),
    axis.title.y = element_text(size = 12),
    legend.title = element_blank(),
    legend.position = "top",
    panel.grid.major = element_line(linewidth = 0.1, color = "gray60"),
    panel.grid.minor = element_line(linewidth = 0.1, color = "lightgray")
  ) +
  scale_y_log10() +
  scale_fill_manual(values = cols4,
                    label = c(
                      expression("10" ^ "2"),
                      expression("10" ^ "3"),
                      expression("10" ^ "5")
                    ))

REO_ME2 <- 
  persite_norm |>
  filter(virus == "Mammalian_orthoreovirus3") |>
  filter(type == "dedup_TE") |>
  filter(Background == "p8") |>
  rename(Viral.load = `Viral load`) |>
  ggplot(aes(
    x = site,
    y = coverage,
    fill = as.character(Viral.load)
  )) +
  geom_col() +
  facet_grid(seg ~ Sample_id) +
  ylab("Coverage") +
  ggtitle("Mammalian orthoreovirus 3 (Background M2 deduplicated)") +
  guides(fill = guide_legend(title = "Viral load")) +
  theme_few() +
  theme(
    axis.title.x = element_text(size = 12),
    axis.text.x = element_text(angle = 30, hjust = 1),
    axis.title.y = element_text(size = 12),
    legend.title = element_blank(),
    legend.position = "top",
    panel.grid.major = element_line(linewidth = 0.1, color = "gray60"),
    panel.grid.minor = element_line(linewidth = 0.1, color = "lightgray")
  ) + scale_y_log10() +
  scale_fill_manual(values = cols4,
                    label = c(
                      expression("10" ^ "2"),
                      expression("10" ^ "3"),
                      expression("10" ^ "5")
                    ))

ggarrange(REO_ME1, REO_ME2, ncol = 2, common.legend = TRUE)

```

####***Final plot***
```{r}
ggsave("figures/manuscript_figure_2025/PDF/Figure_S9.pdf",
       width=20,
       height=12,
       dpi=600)

ggsave("figures/manuscript_figure_2025/PNG/Figure_S9.png",
       width=20,
       height=12,
       dpi=600)
```




### --------------------

## >>>> Figure 3: all viruses - target and off target

### Read-in data
```{r}

# read in metadata

metadata <-
  read.csv("metadata/sampleIDs_TESpikeIn.csv", header = TRUE)

metadata2 <-
  metadata |>
  select(
    Sample.ID,
    Background.sample,
    Viral.load,
    Number.of.read.pairs..quality.adaptor.trimmed.
  ) |>
  rename(Sample_id = Sample.ID) |>
  rename(QC_reads = Number.of.read.pairs..quality.adaptor.trimmed.)

#import viral reads mapped (to calculate proportions)

viral_reads_dedup <-
  read.csv(
    "data_TE/total_virus_mapped_reads_per_sample_dedup_atcc_ref_20241108.csv",
    header = TRUE
  )

viral_reads_nodedup <-
  read.csv(
    "data_TE/total_virus_mapped_reads_per_sample_nodedup_atcc_ref_20241108.csv",
    header = TRUE
  )

#import contingency tables

contingency_dedup <-
  read.csv(
    "data_TE/contingency_table_mapped_virus_reads_per_family_genus_sample_dedup_atcc_ref_20241106.csv",
    header = TRUE
  )

contingency_nodedup <-
  read.csv(
    "data_TE/contingency_table_mapped_virus_reads_per_family_genus_sample_nodedup_atcc_ref_20241106.csv",
    header = TRUE
  )

contaminants <-
  c("Betacoronavirus", "Alphainfluenzavirus", "Gammaretrovirus")

```

### Format data
```{r}

# pivot longer  
dedup_long <-
  contingency_dedup |>
  filter(!genus %in% contaminants) |>
  filter(genus != "NA") |>
  pivot_longer(cols = A:P,
               names_to = "Sample_id",
               values_to = "count") |>
  mutate(
    target = case_when(
      genus == "Cyclovirus" ~ "off_target",
      genus == "Cardiovirus" ~ "off_target",
      genus == "Kobuvirus" ~ "off_target",
      .default = "on_target"
    )
  ) |>
  mutate(genome_structure = 
           case_when((genus == "Mastadenovirus" | 
                        genus == "Cytomegalovirus" | 
                        genus == "Cyclovirus" ~ "DNA"),
                     .default = "RNA"
  ))

no_dedup_long <- contingency_nodedup |>
  filter(!genus %in% contaminants) |>
  filter(genus != "NA") |>
  pivot_longer(cols = A:P,
               names_to = "Sample_id",
               values_to = "count") |>
  mutate(
    target = case_when(
      genus == "Cyclovirus" ~ "off_target",
      genus == "Cardiovirus" ~ "off_target",
      genus == "Kobuvirus" ~ "off_target",
      .default = "on_target"
    )
  ) |>
  mutate(genome_structure = 
           case_when((genus == "Mastadenovirus" | 
                        genus == "Cytomegalovirus" | 
                        genus == "Cyclovirus" ~ "DNA"),
                     .default = "RNA"
  ))

metadata_dedup <- left_join(dedup_long, metadata2, by = "Sample_id")

dedup_viral_reads <-
  left_join(metadata_dedup, viral_reads_dedup, by = "Sample_id") |>
  mutate(total_reads = QC_reads * 2) |>
  mutate(prop_total = count / total_reads) |>
  mutate(prop_viral = count / total_virus_reads)

metadata_no_dedup <- left_join(no_dedup_long, metadata2, by = "Sample_id")

nodedup_viral_reads <-
  left_join(metadata_no_dedup, viral_reads_nodedup, by = "Sample_id") |>
  mutate(total_reads = QC_reads * 2) |>
  mutate(prop_total = count / total_reads) |>
  mutate(prop_viral = count / total_virus_reads)

```

### Make plots
```{r, warning=FALSE, message=FALSE, echo=FALSE}

facet_names <- c(
  "Msp_p2" = "M1",
  "Msp_p8" = "M2",
  "off_target" = "Background",
  "on_target" = "Spike-in"
)

cols <-
  c(
    "#CCBB44",
    "#332288",
    "#EE7733",
    "#66CCEE",
    "#882255",
    "#4477AA",
    "#AA3377",
    "#228833",
    "#EE6677"
  )


dedup_reads <- 
  dedup_viral_reads |>
  filter(Background.sample != "Neg_control") |>
  filter(Background.sample != "Msp_p6") |>
  ggplot(aes(
    x = Viral.load,
    y = (count),
    colour = genus
  )) +
  geom_point() +
  geom_smooth(aes(group = genus, linetype = genome_structure), se = FALSE) +
  facet_grid(target ~ Background.sample, labeller = as_labeller(facet_names)) +
  theme_few() + 
  scale_x_log10() +
  scale_y_log10() +
  ylab("Viral Reads") + 
  xlab("Genome copies") + 
  scale_color_manual(values = cols) +
  scale_linetype_manual(values=c("dotted", "solid")) + 
  scale_x_log10(labels = scales::trans_format("log10", scales::label_math())) + 
    scale_y_log10(labels = scales::trans_format("log10", scales::label_math())) +
  theme(
    axis.title.x = element_text(size = 12),
    # axis.text.x = element_text(hjust = 1),
    axis.title.y = element_text(size = 12),
    legend.title = element_blank(),
    legend.position = "top", 
    panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
    panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
  )

dedup_prop_viral <-
  dedup_viral_reads |>
  filter(Background.sample != "Neg_control") |>
  filter(Background.sample != "Msp_p6") |>
  ggplot(aes(x = Viral.load, y = prop_viral, colour = genus)) +
  geom_point() +
  geom_smooth(aes(group = genus, linetype = genome_structure), se = FALSE) +
  facet_grid(target ~ Background.sample, labeller = as_labeller(facet_names)) +
  theme_few() + 
  scale_x_log10() +
  scale_y_log10() +
  scale_x_log10(labels = scales::trans_format("log10",
                                              scales::label_math())) + 
  scale_y_log10(labels = scales::trans_format("log10",
                                              scales::label_math())) +
  scale_linetype_manual(values=c("dotted", "solid")) + 
  ylab("Proportion of viral reads") +
  xlab("Genome copies") + 
  scale_color_manual(values = cols) +
    theme(
    axis.title.x = element_text(size = 12),
    # axis.text.x = element_text(angle = 30, hjust = 1),
    axis.title.y = element_text(size = 12),
    legend.title = element_blank(),
    legend.position = "top", 
    panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
    panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
  )

dedup_prop_all <-
  dedup_viral_reads |>
  filter(Background.sample != "Neg_control") |>
  filter(Background.sample != "Msp_p6") |>
  ggplot(aes(x = Viral.load, y = prop_total, colour = genus)) +
  geom_point() +
  geom_smooth(aes(group = genus, linetype = genome_structure), se = FALSE) +
  facet_grid(target ~ Background.sample, labeller = as_labeller(facet_names)) +
  theme_few() + 
  scale_x_log10(labels = scales::trans_format("log10",
                                              scales::label_math())) + 
  scale_y_log10(limits = c(5e-8, 1), breaks = 10**(seq(-6,0,2)),labels = scales::trans_format("log10",
                                              scales::label_math())) +
  scale_linetype_manual(values=c("dotted", "solid")) + 
  ylab("Proportion of total reads") +
  xlab("Genome copies") + 
  scale_color_manual(values = cols) +
    theme(
    axis.title.x = element_text(size = 12),
    # axis.text.x = element_text(angle = 30, hjust = 1),
    axis.title.y = element_text(size = 12),
    axis.text.y = element_text(hjust = 1),
    legend.title = element_blank(),
    legend.position = "top", 
    panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
    panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
  )


```

### Dedup -- viral reads, prop all reads, and prop viral reads
```{r, warning=FALSE, message=FALSE, echo=FALSE, fig.width=8, fig.height=12}

ggarrange(
  dedup_reads,
  dedup_prop_all,
  dedup_prop_viral,
  nrow = 3,
  common.legend = TRUE,
  align = "hv"
)

```

### ***Final plot***

```{r, warning=FALSE, message=FALSE, echo=FALSE, fig.width=7, fig.height=8}


##just plot viral reads and prop viral

ggarrange(
  dedup_reads,
  dedup_prop_viral,
  nrow = 2,
  common.legend = TRUE,
  align = "hv"
)

# ggsave("figures/compare_spike_ins_atcc/target_offtarget_dedup_2025-01-01.png", width = 10, height = 8)

ggsave("figures/manuscript_figure_2025/PDF/Figure_3.pdf", 
       width = 10, height = 8)

ggsave("figures/manuscript_figure_2025/PNG/Figure_3.png", 
       width = 10, height = 8)
```
### .........................
### .........................

### ----Genome medicine results----
### Figure S10

#### Read-in data
```{r}


####read counts/normalised read counts####

#import and combine read count files

gm_readcount_dedup <-
  read.table(
    "data_genome_medicine/genome_medicine_TE_sequencing_experiment_readcount_per_sample_and_virus_bowtie2_dedup_atcc_ref.tsv",
    sep = "\t",
    header = TRUE
  )

gm_readcount_nodedup <-
  read.table(
    "data_genome_medicine/genome_medicine_TE_sequencing_experiment_readcount_per_sample_and_virus_bowtie2_nodedup_atcc_ref.tsv",
    sep = "\t",
    header = TRUE
  )

gm_counts_all <- rbind(gm_readcount_dedup, gm_readcount_nodedup)

```

#### read counts 
```{r, echo=FALSE, message=FALSE, warning=FALSE}
#normalise by both raw read count and genome length - should be the same as mean read depth

facet_names <- c("dedup_GM_TE" = "Deduplicated",
                 "nodedup_GM_TE" = "Non-Deduplicated")

cols <- c("#4477AA",
          "#66CCEE",
          "#228833",
          "#CCBB44",
          "#EE6677",
          "#AA3377")

gm_counts_norm <- 
  gm_counts_all |>
  mutate(norm_counts3 = matched / length)

read_counts <- 
  gm_counts_norm |>
  filter(Viral.load != 0) |>
  ggplot(aes(
    x = Viral.load,
    y = (matched),
    colour = virus
  )) +
  geom_point() +
  geom_smooth(aes(group = virus), se = FALSE, span = 0.85) +
  facet_grid( ~ type, labeller = as_labeller(facet_names)) +
  theme_few() +
  theme(
    axis.title.x = element_text(size = 12),
    # axis.text.x = element_text(angle = 30, hjust = 1),
    axis.title.y = element_text(size = 12),
    legend.title = element_blank(),
    legend.position = "top",
    panel.grid.major = element_line(linewidth = 0.1, color = "gray60"),
    panel.grid.minor = element_line(linewidth = 0.1, color = "lightgray")
  ) +
  scale_color_manual(
    values = cols,
    labels = c(
      "Human adenovirus 40",
      "Human betaherpesvirus",
      "Human respiratory syncytial virus",
      "Influenza B virus",
      "Mammalian orthoreovirus 3",
      "Zika virus"
    )
  ) +
  ylab("Viral Reads") +
  xlab("Genome copies") + 
  scale_x_log10(labels = scales::trans_format("log10",
                                              scales::label_math())) + 
  scale_y_log10(labels = scales::trans_format("log10",
                                              scales::label_math()))

```

#### read depth
```{r}
#import and combine read depth files

depth_dedup_gm <-
  read.table(
    "data_genome_medicine/genome_medicine_TE_sequencing_experiment_readdepth_per_sample_and_virus_bowtie2_dedup_atcc_ref.tsv",
    sep = "\t",
    header = TRUE
  )

depth_nodedup_gm <-
  read.table(
    "data_genome_medicine/genome_medicine_TE_sequencing_experiment_readdepth_per_sample_and_virus_bowtie2_nodedup_atcc_ref.tsv",
    sep = "\t",
    header = TRUE
  )

depths_gm_all <- rbind(depth_dedup_gm, depth_nodedup_gm)

```

```{r, echo=FALSE, warning=FALSE, message=FALSE}

read_depths <- 
  depths_gm_all |>
  filter(Viral.load != 0) |>
  ggplot(aes(
    x = Viral.load,
    y = mean_depth,
    colour = virus
  )) +
  geom_point() +
  geom_smooth(aes(group = virus), se = FALSE, span = 0.9) +
  facet_grid( ~ type, labeller = as_labeller(facet_names)) +
  theme_few() +
  theme(
    axis.title.x = element_text(size = 12),
    # axis.text.x = element_text(angle = 30, hjust = 1),
    axis.title.y = element_text(size = 12),
    legend.title = element_blank(),
    legend.position = "top",
    panel.grid.major = element_line(linewidth = 0.1, color = "gray60"),
    panel.grid.minor = element_line(linewidth = 0.1, color = "lightgray")
  ) +
  scale_color_manual(
    values = cols,
    labels = c(
      "Human adenovirus 40",
      "Human betaherpesvirus",
      "Human respiratory syncytial virus",
      "Influenza B virus",
      "Mammalian orthoreovirus 3",
      "Zika virus"
    )
  ) +
  ylab("Mean read depth") +
  xlab("Genome copies") + 
  scale_x_log10(labels = scales::trans_format("log10",
                                              scales::label_math())) + 
  scale_y_log10(labels = scales::trans_format("log10",
                                              scales::label_math()))



```

```{r, echo=FALSE, message=FALSE, warning=FALSE}

ggarrange(read_counts,
          read_depths,
          nrow = 2,
          common.legend = TRUE)


#PDF
ggsave("figures/manuscript_figure_2025/PDF/Figure_S10.pdf", 
       width = 7,
       height = 7)

#PNG
ggsave("figures/manuscript_figure_2025/PNG/Figure_S10.png", 
       width = 7,
       height = 7)

```
### --------------------

### Figure S11

#### Read-in data
```{r}
####genome coverage####

unzip(zipfile = "data_genome_medicine/genome_medicine_TE_sequencing_experiment_readdepth_per_site_sample_and_virus_bowtie2_dedup_atcc_ref.tsv.zip", exdir = "data_genome_medicine/")

dedup_per_site <-
  read.table(
    "data_genome_medicine/genome_medicine_TE_sequencing_experiment_readdepth_per_site_sample_and_virus_bowtie2_dedup_atcc_ref.tsv",
    sep = "\t",
    header = TRUE
  )

file.remove("data_genome_medicine/genome_medicine_TE_sequencing_experiment_readdepth_per_site_sample_and_virus_bowtie2_dedup_atcc_ref.tsv")

#View(dedup_per_site)

counts_dedup_bt <-
  read.table(
    "data_TE/TE_sequencing_experiment_readcount_per_sample_and_virus_bowtie2_dedup_atcc_ref.tsv",
    sep = "\t",
    header = TRUE
  )

#add length column from read depth file to per site file

lengths <- 
  counts_dedup_bt |>
  select(virus,length) |>
  distinct()

#calculate genome coverage

coverage <- 
  dedup_per_site |>
  group_by(sample_id, virus, Viral.load, Replicate) |>
  filter(coverage > 0) |>
  summarise(genome_coverage = n()) |>
  left_join(lengths) |>
  mutate(percent_coverage = genome_coverage / length)

coverage |>
  filter(Viral.load != 0) |>
  ggplot(aes(x = virus, y = percent_coverage)) +
  geom_boxplot(outlier.shape = NA) +
  geom_point(position = position_dodge(width = .75)) +
  facet_grid( ~ as.character(Viral.load)) +
  theme_bw() +
  theme(axis.title.y = element_text(size = 10)) +
  ylab("Genome coverage") +
  theme_few() +
  theme(
    axis.title.x = element_blank(),
    axis.text.x = element_text(angle = 45, hjust = 1),
    axis.title.y = element_text(size = 15),
    axis.text = element_text(size = 12),
    strip.text = element_text(size = 12),
    legend.title = element_blank(),
    legend.position = "top",
    panel.grid.major = element_line(linewidth = 0.1, color = "gray60"),
    panel.grid.minor = element_line(linewidth = 0.1, color = "lightgray")
  ) +
  scale_y_continuous(limits = c(0, 1)) +
    scale_x_discrete(
    labels = c(
      "Human adenovirus 40",
      "Human betaherpesvirus",
      "Human respiratory syncytial virus",
      "Influenza B virus",
      "Mammalian orthoreovirus 3",
      "Zika virus"
    )
  )

#ggsave("/Users/laura/Dropbox/glasgow/github/te_ug_rodents/figures/genome_medicine/all_genome_coverage.png",width=14,height=6)

#ggsave("/Users/laura/Dropbox/glasgow/github/te_ug_rodents/figures/manuscript_figures_pdf/FigureS12.pdf",width=20,height=6)
```

#### *** Final plot ***
```{r}
#PDF
ggsave(
  "figures/manuscript_figure_2025/PDF/Figure_S11.pdf",
  width = 14,
  height = 6
)

#PNG
ggsave(
  "figures/manuscript_figure_2025/PNG/Figure_S11.png",
  width = 14,
  height = 6
)

```
### --------------------

### Figure S12
```{r, echo=FALSE, message=FALSE, warning=FALSE}
####per site plot AdV only####

dedup_per_site |>
  filter(virus == "Human_adenovirus_40") |>
  filter(Viral.load != 0) |>
  ggplot(aes(x=site,y=coverage))+
  geom_col()+
  facet_grid(Viral.load~Replicate)+
  ylab("Coverage per site")+
  ggtitle("Human Adenovirus (deduplicated)")+
  theme_few() +
  theme(
    # axis.title.x = element_text(size = 12),
    # # axis.text.x = element_text(angle = 30, hjust = 1),
    # axis.title.y = element_text(size = 12),
    # legend.title = element_blank(),
    # legend.position = "top",
    panel.grid.major = element_line(linewidth = 0.1, color = "gray60"),
    panel.grid.minor = element_line(linewidth = 0.1, color = "lightgray")
  ) +  
  scale_y_log10(labels = scales::trans_format("log10",
                                              scales::label_math())) +

  xlab("site position")

```

#### *** Final plot ***
```{r}
ggsave("figures/manuscript_figure_2025/PDF/Figure_S12.pdf",
       width=12,
       height=10)

ggsave("figures/manuscript_figure_2025/PNG/Figure_S12.png",
       width=12,
       height=10)
```
### .........................

### ----Kobuvirus / Cardiovirus results----

#### Figure S13

```{r}

kobu_depth <-
  read.table(
    "data_kobuvirus/kobuvirus_TE_polyomics_readdepth_20241007.tsv",
    sep = "\t",
    header = TRUE
  ) %>%
  select(!expt) %>%
  select(!Number.of.paired.end.reads..QT.)

cardio_depth <-
  read.table(
    "data_cardiovirus/cardiovirus_denovo_bowtie2_read_depth_per_sample_dedup.tsv",
    sep = ",",
    header = TRUE
  )

cardio_depth_poly <- 
  read.table(
    "data_cardiovirus/cardiovirus_denovo_bowtie2_readdepth_per_sample_dedup_polyomics.tsv",
    sep = ",",
    header = TRUE
  ) |>
  select(-contig_name) |>
  filter(Background %in% c("S18", "S19", "S43")) |>
  select(-Background) |>
  separate(Sample_id, into = c(NA, NA, "Background"), sep = "-", remove = FALSE)

depth_both <- rbind(kobu_depth, cardio_depth, cardio_depth_poly) |>
  mutate(Viral.load = ifelse(is.na(Viral.load), 0, Viral.load))

depth_both$type <- depth_both$type |>
  case_match("dedup_TE" ~ "dedup",
             "dedup_polyomics" ~ "dedup",
             .default = depth_both$type)

depth_both$Sample_id <- depth_both$Sample_id |>
  case_match("RNA-Msp-p2" ~ "M1",
             "RNA-Msp-p8" ~ "M2",
             "RNA-Msp-p6" ~ "M3",
             .default = depth_both$Sample_id
             )

depth_both$virus <- depth_both$virus |>
  case_match("cardiovirus" ~ "Cardiovirus",
             "k97_19971" ~ "Kobuvirus",
             .default = depth_both$virus
             )

cols2 <- c("#BB5566", "#004488")

coverage_labels <- c(
  "k97_19971" = "Kobuvirus",
  "cardiovirus" = "Cardiovirus",
  "p2" = "M1",
  "p8" = "M2"
)

```

#### *** Final plot ***
```{r}

depth_both$Viral.load <- factor(depth_both$Viral.load, labels = c("Shotgun",
    "10^{2}", 
    "10^{3}", 
    "10^{5}"
    ))


# depth_both |>
#   filter(Background != "p6") |>
#   filter(type == "dedup") |>
#   filter(mapper == "bowtie2") |>
#   filter(Viral.load != "NA") |>
#   ggplot(aes(
#     x = Viral.load,
#     y = (mean_depth),
#     colour = Background
#   )) +
#   geom_point() +
#  theme_few() +
#   theme(
#     axis.title.x = element_text(size = 12),
#     axis.title.y = element_text(size = 12),
#     legend.title = element_blank(),
#     legend.position = "none", 
#     panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
#     panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
#   ) +  
#   facet_grid(Background ~ virus, labeller = as_labeller(coverage_labels)) +
#   scale_color_manual(values = cols2) +
#   ylab("Mean read depth") +
#     xlab("Spike-in viral load (gc/ml)") +
#    scale_x_log10(labels = scales::trans_format("log10",
#                                         scales::label_math())) + 
#   scale_y_log10(labels = scales::trans_format("log10",
#                                         scales::label_math()))

# replotting S13 in similar way to S14
depth_both %>%
  filter(Background != "p6") |>
  filter(type == "dedup") |>
  filter(mapper == "bowtie2") |>
  # filter(Viral.load != "NA") |>
  ggplot(aes(x = Sample_id, y = mean_depth, color = Background)) +
  geom_point() +
  ylab("Mean read depth") +
  xlab("Sample ID") +
  facet_grid(virus ~ Viral.load,
             scales = "free_x",
             label = label_parsed) +
  scale_color_manual(values = cols2, labels = c("M1", "M2")) +
 theme_few() +
  theme(
    axis.title.x = element_text(size = 12),
    axis.title.y = element_text(size = 12),
    legend.title = element_blank(),
    legend.position = "right", 
    panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
    panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
  )+
  scale_y_log10(labels = scales::trans_format("log10",
                                        scales::label_math()))


ggsave("figures/manuscript_figure_2025/PDF/Figure_S13_v2.pdf",
       width=8,
       height=4)

ggsave("figures/manuscript_figure_2025/PNG/Figure_S13_v2.png",
       width=8,
       height=4)

```

### --------------------

#### Figure S14

```{r}

kobu_per_site <-
  read.table(
    "data_kobuvirus/kobuvirus_TE_polyomics_readdepth_persite_20241007.tsv",
    sep = "\t",
    header = TRUE
  ) %>%
  select(virus,
         seg,
         site,
         coverage,
         n_sites,
         Sample_id,
         Background,
         Viral.load,
         mapper,
         type)

kobu_per_site$Viral.load <- 
  kobu_per_site$Viral.load %>%
  replace_na(., 0)

kobu_per_site$Sample_id <- kobu_per_site$Sample_id %>%
  case_match("RNA-Msp-p2" ~ "M1",
             "RNA-Msp-p8" ~ "M2",
             .default = kobu_per_site$Sample_id)

kobu_per_site$Background <- kobu_per_site$Background %>%
  case_match("p2" ~ "M1",
             "p8" ~ "M2",
             .default = kobu_per_site$Background)

cardio_per_site <-
  read.table(
    "data_cardiovirus/cardiovirus_denovo_bowtie2_read_depth_per_site_and_sample_dedup.tsv",
    sep = ",",
    header = TRUE
  )

cardio_polyomics_per_site <-
  read.table(
    "data_cardiovirus/cardiovirus_denovo_bowtie2_readdepth_per_site_sample_dedup_polyomics.tsv",
    sep = ",",
    header = TRUE
  ) %>%
  select(!contig_name)

#View(cardio_polyomics_per_site)

cardio_TE_polyomics <-
  full_join(
    cardio_per_site,
    cardio_polyomics_per_site,
    by = c(
      "virus",
      "seg",
      "site",
      "coverage",
      "n_sites",
      "Sample_id",
      "Background",
      "Viral.load",
      "mapper",
      "type"
    )
  )

cardio_TE_polyomics$Viral.load <- cardio_TE_polyomics$Viral.load %>%
  replace_na(., 0)

drops <-
  c(
    "RNA-Msp-p1",
    "RNA-Msp-p3",
    "RNA-Msp-p4",
    "RNA-Msp-p5",
    "RNA-Msp-p6",
    "RNA-Msp-p7",
    "RNA-Msp-p9"
  )

cardio_TE_polyomics <- cardio_TE_polyomics %>%
  filter(!Sample_id %in% drops)

cardio_TE_polyomics$Sample_id <- cardio_TE_polyomics$Sample_id %>%
  case_match("RNA-Msp-p2" ~ "M1",
             "RNA-Msp-p8" ~ "M2",
             .default = cardio_TE_polyomics$Sample_id)

cardio_TE_polyomics$Background <- cardio_TE_polyomics$Background %>%
  case_match("S18" ~ "M1",
             "S43" ~ "M2",
             "p2" ~ "M1",
             "p8" ~ "M2",
             .default = cardio_TE_polyomics$Background)

persite_both <- rbind(kobu_per_site, cardio_TE_polyomics)

persite_both$type <- persite_both$type %>%
  case_match("dedup_TE" ~ "dedup",
             "dedup_polyomics" ~ "dedup",
             .default = persite_both$type)

#TE data only - compare coverage between backgrounds/viral loads

cols2 <- c("#BB5566", "#004488")

coverage_labels <- c(
  "100" = "10^{2}",
  "1000" = "10^{3}",
  "100000" = "10^{5}",
  "0" = "Shotgun",
  "Kobuvirus" = "Kobuvirus",
  "Cardiovirus" = "Cardiovirus"
)

coverage_labels2 <- c(
  "100" = "10^{2}",
  "1000" = "10^{3}",
  "100000" = "10^{5}",
  "0" = "Shotgun",
  "A" = "A",
  "B" = "B",
  "C" = "C",
  "D" = "D",
  "F" = "F",
  "G" = "G",
  "H" = "H",
  "I" = "I",
  "J" = "J",
  "K" = "K",
  "L" = "L",
  "M" = "M",
  "N" = "N",
  "O" = "O",
  "P" = "P",
  "ME_P1" = "M1",
  "ME_P2" = "M2"
)

kobu_coverage <- kobu_per_site %>%
  filter(mapper == "bowtie2") %>%
  filter(type == "dedup") %>%
  filter(Background != "p6") %>%
  group_by(Sample_id, Viral.load, Background) %>%
  filter(coverage > 0) %>%
  summarise(genome_coverage = n()) %>%
  mutate(percent_coverage = genome_coverage / 8467) %>%
  mutate(virus = "Kobuvirus")

cardio_coverage <- cardio_TE_polyomics %>%
  filter(Background != "p6") %>%
  group_by(Sample_id, Viral.load, Background) %>%
  filter(coverage > 0) %>%
  summarise(genome_coverage = n()) %>%
  mutate(percent_coverage = genome_coverage / 6945) %>%
  mutate(virus = "Cardiovirus")

coverage_both <- rbind(kobu_coverage, cardio_coverage)

coverage_both$Viral.load <- factor(coverage_both$Viral.load, labels = c("Shotgun",
    "10^{2}", 
    "10^{3}", 
    "10^{5}"
    ))

```

#### *** Final plot ***
```{r}

coverage_both %>%
  ggplot(aes(x = Sample_id, y = percent_coverage, color = Background)) +
  geom_point() +
  ylab("Genome coverage") +
  xlab("Sample ID") +
  facet_grid(virus ~ (Viral.load),
             scales = "free_x",
             label = label_parsed) +
  scale_color_manual(values = cols2, labels = c("M1", "M2")) +
 theme_few() +
  theme(
    axis.title.x = element_text(size = 12),
    axis.title.y = element_text(size = 12),
    legend.title = element_blank(),
    legend.position = "right", 
    panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
    panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
  ) + 
  scale_y_continuous(limits = c(0, 1))

#PDF
ggsave("figures/manuscript_figure_2025/PDF/Figure_S14.pdf", 
       width = 8, 
       height = 4)

#PNG
ggsave("figures/manuscript_figure_2025/PNG/Figure_S14.png", 
       width = 8, 
       height = 4)

```



### --------------------

#### Figure S15

```{r, echo=FALSE, warning=FALSE, message=FALSE}

#per site genome coverage - Kobu

persite_both$Viral.load <- factor(persite_both$Viral.load, labels = c("Shotgun",
    "10^{2}", 
    "10^{3}", 
    "10^{5}"
    ))

persite_both$Viral.load <- factor(persite_both$Viral.load, levels = c(
    "10^{2}", 
    "10^{3}", 
    "10^{5}",
    "Shotgun"
    ))

persite_both |>
  filter(virus == "k97_19971") |>
  filter(mapper == "bowtie2") |>
  filter(type == "dedup") |>
  filter(Background != "p6") |>
  ggplot(aes(
    x = site,
    y = (coverage),
    fill = Background
  )) +
  geom_col() +
  theme_few() +
  theme(
    axis.title.x = element_text(size = 12),
    axis.title.y = element_text(size = 12),
    legend.title = element_blank(),
    legend.position = "right",
    panel.grid.major = element_line(linewidth = 0.1, color = "gray60"),
    panel.grid.minor = element_line(linewidth = 0.1, color = "lightgray")
  ) +   
  facet_wrap(Viral.load ~ Sample_id, labeller = label_parsed) +
  scale_fill_manual(values = cols2, labels = c("M1", "M2")) +
  ylab("Coverage") +
  xlab("") +
  ggtitle("Kobuvirus") +
  scale_y_log10(labels = scales::trans_format("log10",
                                              scales::label_math()))

```

#### *** Final plot ***
```{r}
#PDF
ggsave("figures/manuscript_figure_2025/PDF/Figure_S15.pdf",
       width=12,
       height=7)

#PNG
ggsave("figures/manuscript_figure_2025/Png/Figure_S15.png",
       width=12,
       height=7)

```

#### Figure S16
```{r, echo=FALSE, warning=FALSE, message=FALSE}

#per site genome coverage - Cardio

persite_both |>
  filter(virus == "cardiovirus") |>
  filter(mapper == "bowtie2") |>
  filter(type == "dedup") |>
  filter(Background != "p6") |>
  ggplot(aes(
    x = site,
    y = coverage,
    fill = Background
  )) +
  geom_col() +
  facet_wrap(Viral.load ~ Sample_id, labeller = label_parsed) +
  scale_fill_manual(values = cols2, labels = c("M1", "M2")) +
  theme_few() +
  theme(
    axis.title.x = element_text(size = 12),
    axis.title.y = element_text(size = 12),
    legend.title = element_blank(),
    legend.position = "right",
    panel.grid.major = element_line(linewidth = 0.1, color = "gray60"),
    panel.grid.minor = element_line(linewidth = 0.1, color = "lightgray")
  ) +
  ylab("Coverage") +
  xlab("") +
  ggtitle("Cardiovirus") +
  scale_y_log10(labels = scales::trans_format("log10",
                                              scales::label_math()))
```

#### *** Final plot ***
```{r}

#PDF
ggsave("figures/manuscript_figure_2025/PDF/Figure_S16.pdf",
       width=12,
       height=7)


#PNG
ggsave("figures/manuscript_figure_2025/PNG/Figure_S16.png",
       width=12,
       height=7)
```

